From d01893d215f9069849f34c667c467b1112882e64 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 3 Sep 2024 10:48:07 -0400
Subject: [PATCH] Add GPU markers to fisher f dist

Add SYCL testing of fisher f dist
Add CUDA fisher f dist testing
Add NVRTC fisher f dist testing
Add GPU support to gamma dist
Add SYCL testing of gamma dist
Add CUDA gamma dist testing
Add NVRTC gamma dist testing
Reduce number of threads per block since it can crash CI
Add GPU support to the geometric dist
Add SYCL testing of geometric dist
Add cuda::std::tie
Add GPU support to inv_discrete_quantile
Add CUDA testing of geometric dist
Add NVRTC testing of geometric dist
Add SYCL testing of inverse_chi_squared dist
Adjust tol
Add NVRTC inverse chi squared dist testing
Add CUDA inverse chi squared dist testing
Add GPU support to inverse gamma dist
Add SYCL testing to inverse gamma dist
Add NVRTC testing of inverse gamma dist
Add CUDA testing of inverse gamma dist
---
 .../detail/inv_discrete_quantile.hpp | 71 ++++---
 include/boost/math/distributions/fisher_f.hpp | 69 +++----
 include/boost/math/distributions/gamma.hpp | 84 ++++----
 .../boost/math/distributions/geometric.hpp | 93 +++++----
 .../distributions/inverse_chi_squared.hpp | 68 +++----
 .../math/distributions/inverse_gamma.hpp | 77 +++----
 include/boost/math/tools/tuple.hpp | 1 +
 test/cuda_jamfile | 35 ++++
 test/nvrtc_jamfile | 35 ++++
 test/sycl_jamfile | 5 +
 test/test_arcsine_cdf_double.cu | 2 +-
 test/test_arcsine_cdf_float.cu | 2 +-
 test/test_arcsine_pdf_double.cu | 2 +-
 test/test_arcsine_pdf_float.cu | 2 +-
 test/test_arcsine_quan_double.cu | 2 +-
 test/test_arcsine_quan_float.cu | 2 +-
 test/test_arcsine_range_support_double.cu | 2 +-
 test/test_arcsine_range_support_float.cu | 2 +-
 test/test_bernoulli_cdf_double.cu | 2 +-
 test/test_bernoulli_cdf_float.cu | 2 +-
 test/test_bernoulli_pdf_double.cu | 2 +-
 test/test_bernoulli_pdf_float.cu | 2 +-
 test/test_bernoulli_range_support_double.cu | 2 +-
 test/test_bernoulli_range_support_float.cu | 2 +-
 test/test_beta_dist_cdf_double.cu | 2 +-
 test/test_beta_dist_cdf_float.cu | 2 +-
 test/test_beta_dist_pdf_double.cu | 2 +-
 test/test_beta_dist_pdf_float.cu | 2 +-
 test/test_beta_dist_quan_double.cu | 2 +-
 test/test_beta_dist_quan_float.cu | 2 +-
 test/test_cauchy_cdf_double.cu | 2 +-
 test/test_cauchy_cdf_float.cu | 2 +-
 test/test_cauchy_pdf_double.cu | 2 +-
 test/test_cauchy_pdf_float.cu | 2 +-
 test/test_cauchy_quan_double.cu | 2 +-
 test/test_cauchy_quan_float.cu | 2 +-
 test/test_cauchy_range_support_double.cu | 2 +-
 test/test_cauchy_range_support_float.cu | 2 +-
 test/test_chi_squared_cdf_double.cu | 2 +-
 test/test_chi_squared_cdf_float.cu | 2 +-
 test/test_chi_squared_pdf_double.cu | 2 +-
 test/test_chi_squared_pdf_float.cu | 2 +-
 test/test_chi_squared_quan_double.cu | 2 +-
 test/test_chi_squared_quan_float.cu | 2 +-
 test/test_exponential_cdf_double.cu | 2 +-
 test/test_exponential_cdf_float.cu | 2 +-
 test/test_exponential_pdf_double.cu | 2 +-
 test/test_exponential_pdf_float.cu | 2 +-
 test/test_exponential_quan_double.cu | 2 +-
 test/test_exponential_quan_float.cu | 2 +-
 test/test_exponential_range_support_double.cu | 2 +-
 test/test_exponential_range_support_float.cu | 2 +-
 test/test_extreme_value_cdf_double.cu | 2 +-
 test/test_extreme_value_cdf_float.cu | 2 +-
 test/test_extreme_value_pdf_double.cu | 2 +-
 test/test_extreme_value_pdf_float.cu | 2 +-
 test/test_extreme_value_quan_double.cu | 2 +-
 test/test_extreme_value_quan_float.cu | 2 +-
 test/test_fisher_f.cpp | 6 +-
 test/test_fisher_f_cdf_double.cu | 109 ++++++++++
 test/test_fisher_f_cdf_float.cu | 109 ++++++++++
 test/test_fisher_f_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_pdf_double.cu | 109 ++++++++++
 test/test_fisher_f_pdf_float.cu | 109 ++++++++++
 test/test_fisher_f_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_quan_double.cu | 109 ++++++++++
 test/test_fisher_f_quan_float.cu | 109 ++++++++++
 test/test_fisher_f_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_fisher_f_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist.cpp | 9 +-
 test/test_gamma_dist_cdf_double.cu | 109 ++++++++++
 test/test_gamma_dist_cdf_float.cu | 109 ++++++++++
 test/test_gamma_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_pdf_double.cu | 109 ++++++++++
 test/test_gamma_dist_pdf_float.cu | 109 ++++++++++
 test/test_gamma_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_quan_double.cu | 109 ++++++++++
 test/test_gamma_dist_quan_float.cu | 109 ++++++++++
 test/test_gamma_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_gamma_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric.cpp | 18 +-
 test/test_geometric_dist_cdf_double.cu | 109 ++++++++++
 test/test_geometric_dist_cdf_float.cu | 109 ++++++++++
 test/test_geometric_dist_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_pdf_double.cu | 109 ++++++++++
 test/test_geometric_dist_pdf_float.cu | 109 ++++++++++
 test/test_geometric_dist_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_quan_double.cu | 109 ++++++++++
 test/test_geometric_dist_quan_float.cu | 109 ++++++++++
 .../test_geometric_dist_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_geometric_dist_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_holtsmark_cdf_double.cu | 2 +-
 test/test_holtsmark_cdf_float.cu | 2 +-
 test/test_holtsmark_pdf_double.cu | 2 +-
 test/test_holtsmark_pdf_float.cu | 2 +-
 test/test_inverse_chi_squared_cdf_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_cdf_float.cu | 110 ++++++++++
 ...t_inverse_chi_squared_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...st_inverse_chi_squared_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 .../test_inverse_chi_squared_distribution.cpp | 7 +-
 test/test_inverse_chi_squared_pdf_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_pdf_float.cu | 110 ++++++++++
 ...t_inverse_chi_squared_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...st_inverse_chi_squared_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_chi_squared_quan_double.cu | 110 ++++++++++
 test/test_inverse_chi_squared_quan_float.cu | 110 ++++++++++
 ..._inverse_chi_squared_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 ...t_inverse_chi_squared_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_cdf_double.cu | 110 ++++++++++
 test/test_inverse_gamma_cdf_float.cu | 110 ++++++++++
 test/test_inverse_gamma_cdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_cdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_distribution.cpp | 7 +-
 test/test_inverse_gamma_pdf_double.cu | 110 ++++++++++
 test/test_inverse_gamma_pdf_float.cu | 110 ++++++++++
 test/test_inverse_gamma_pdf_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_pdf_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_quan_double.cu | 110 ++++++++++
 test/test_inverse_gamma_quan_float.cu | 110 ++++++++++
 test/test_inverse_gamma_quan_nvrtc_double.cpp | 191 ++++++++++++++++++
 test/test_inverse_gamma_quan_nvrtc_float.cpp | 191 ++++++++++++++++++
 test/test_landau_cdf_double.cu | 2 +-
 test/test_landau_cdf_float.cu | 2 +-
 test/test_landau_pdf_double.cu | 2 +-
 test/test_landau_pdf_float.cu | 2 +-
 test/test_landau_quan_double.cu | 2 +-
 test/test_landau_quan_float.cu | 2 +-
 test/test_laplace_cdf_double.cu | 2 +-
 test/test_laplace_cdf_float.cu | 2 +-
 test/test_laplace_pdf_double.cu | 2 +-
 test/test_laplace_pdf_float.cu | 2 +-
 test/test_laplace_quan_double.cu | 2 +-
 test/test_laplace_quan_float.cu | 2 +-
 test/test_logistic_cdf_double.cu | 2 +-
 test/test_logistic_cdf_float.cu | 2 +-
 test/test_logistic_pdf_double.cu | 2 +-
 test/test_logistic_pdf_float.cu | 2 +-
 test/test_logistic_quan_double.cu | 2 +-
 test/test_logistic_quan_float.cu | 2 +-
 test/test_mapairy_cdf_double.cu | 2 +-
 test/test_mapairy_cdf_float.cu | 2 +-
 test/test_mapairy_pdf_double.cu | 2 +-
 test/test_mapairy_pdf_float.cu | 2 +-
 test/test_mapairy_quan_double.cu | 2 +-
 test/test_mapairy_quan_float.cu | 2 +-
 test/test_saspoint5_cdf_double.cu | 2 +-
 test/test_saspoint5_cdf_float.cu | 2 +-
 test/test_saspoint5_pdf_double.cu | 2 +-
 test/test_saspoint5_pdf_float.cu | 2 +-
 test/test_saspoint5_quan_double.cu | 2 +-
 test/test_saspoint5_quan_float.cu | 2 +-
 test/test_weibull_cdf_double.cu | 2 +-
 test/test_weibull_cdf_float.cu | 2 +-
 test/test_weibull_pdf_double.cu | 2 +-
 test/test_weibull_pdf_float.cu | 2 +-
 test/test_weibull_quan_double.cu | 2 +-
 test/test_weibull_quan_float.cu | 2 +-
 163 files changed, 9449 insertions(+), 324 deletions(-)
 create mode 100644 test/test_fisher_f_cdf_double.cu
 create mode 100644 test/test_fisher_f_cdf_float.cu
 create mode 100644 test/test_fisher_f_cdf_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_cdf_nvrtc_float.cpp
 create mode 100644 test/test_fisher_f_pdf_double.cu
 create mode 100644 test/test_fisher_f_pdf_float.cu
 create mode 100644 test/test_fisher_f_pdf_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_pdf_nvrtc_float.cpp
 create mode 100644 test/test_fisher_f_quan_double.cu
 create mode 100644 test/test_fisher_f_quan_float.cu
 create mode 100644 test/test_fisher_f_quan_nvrtc_double.cpp
 create mode 100644 test/test_fisher_f_quan_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_cdf_double.cu
 create mode 100644 test/test_gamma_dist_cdf_float.cu
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_pdf_double.cu
 create mode 100644 test/test_gamma_dist_pdf_float.cu
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_gamma_dist_quan_double.cu
 create mode 100644 test/test_gamma_dist_quan_float.cu
 create mode 100644 test/test_gamma_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_gamma_dist_quan_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_cdf_double.cu
 create mode 100644 test/test_geometric_dist_cdf_float.cu
 create mode 100644 test/test_geometric_dist_cdf_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_cdf_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_pdf_double.cu
 create mode 100644 test/test_geometric_dist_pdf_float.cu
 create mode 100644 test/test_geometric_dist_pdf_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_pdf_nvrtc_float.cpp
 create mode 100644 test/test_geometric_dist_quan_double.cu
 create mode 100644 test/test_geometric_dist_quan_float.cu
 create mode 100644 test/test_geometric_dist_quan_nvrtc_double.cpp
 create mode 100644 test/test_geometric_dist_quan_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_cdf_double.cu
 create mode 100644 test/test_inverse_chi_squared_cdf_float.cu
 create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_pdf_double.cu
 create mode 100644 test/test_inverse_chi_squared_pdf_float.cu
 create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_chi_squared_quan_double.cu
 create mode 100644 test/test_inverse_chi_squared_quan_float.cu
 create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_double.cpp
 create mode 100644 test/test_inverse_chi_squared_quan_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_cdf_double.cu
 create mode 100644 test/test_inverse_gamma_cdf_float.cu
 create mode 100644 test/test_inverse_gamma_cdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_cdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_pdf_double.cu
 create mode 100644 test/test_inverse_gamma_pdf_float.cu
 create mode 100644 test/test_inverse_gamma_pdf_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_pdf_nvrtc_float.cpp
 create mode 100644 test/test_inverse_gamma_quan_double.cu
 create mode 100644 test/test_inverse_gamma_quan_float.cu
 create mode 100644 test/test_inverse_gamma_quan_nvrtc_double.cpp
 create mode 100644 test/test_inverse_gamma_quan_nvrtc_float.cpp

diff --git a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
index 739a86666..f688345b7 100644
--- a/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
+++ b/include/boost/math/distributions/detail/inv_discrete_quantile.hpp
@@ -6,7 +6,11 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 #define BOOST_MATH_DISTRIBUTIONS_DETAIL_INV_DISCRETE_QUANTILE
 
-#include
+#include
+#include
+#include
+#include
+#include
 
 namespace boost{ namespace math{ namespace detail{
 
@@ -19,10 +23,10 @@ struct distribution_quantile_finder
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
 
-   distribution_quantile_finder(const Dist d, value_type p, bool c)
+   BOOST_MATH_GPU_ENABLED distribution_quantile_finder(const Dist d, value_type p, bool c)
       : dist(d), target(p), comp(c) {}
 
-   value_type operator()(value_type const& x)
+   BOOST_MATH_GPU_ENABLED value_type operator()(value_type const& x)
    {
       return comp ? value_type(target - cdf(complement(dist, x))) : value_type(cdf(dist, x) - target);
    }
@@ -42,24 +46,24 @@ private:
 // in the root no longer being bracketed.
 //
 template <class Real, class Tol>
-void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& /* b */, Tol const& /* tol */){}
 
 template <class Real>
-void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& /* a */, Real& b, tools::equal_floor const& /* tol */)
 {
    BOOST_MATH_STD_USING
    b -= tools::epsilon<Real>() * b;
 }
 
 template <class Real>
-void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& /* b */, tools::equal_ceil const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
 }
 
 template <class Real>
-void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
+BOOST_MATH_GPU_ENABLED void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol */)
 {
    BOOST_MATH_STD_USING
    a += tools::epsilon<Real>() * a;
@@ -69,7 +73,7 @@ void adjust_bounds(Real& a, Real& b, tools::equal_nearest_integer const& /* tol
 // This is where all the work is done:
 //
 template <class Dist, class Tolerance>
-typename Dist::value_type
+BOOST_MATH_GPU_ENABLED typename Dist::value_type
    do_inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -78,7 +82,7 @@ typename Dist::value_type
       const typename Dist::value_type& multiplier,
       typename Dist::value_type adder,
       const Tolerance& tol,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    typedef typename Dist::policy_type policy_type;
@@ -100,7 +104,7 @@ typename Dist::value_type
       guess = min_bound;
 
    value_type fa = f(guess);
-   std::uintmax_t count = max_iter - 1;
+   boost::math::uintmax_t count = max_iter - 1;
    value_type fb(fa), a(guess), b =0; // Compiler warning C4701: potentially uninitialized local variable 'b' used
 
    if(fa == 0)
@@ -130,7 +134,7 @@ typename Dist::value_type
       else
       {
          b = a;
-         a = (std::max)(value_type(b - 1), value_type(0));
+         a = BOOST_MATH_GPU_SAFE_MAX(value_type(b - 1), value_type(0));
          if(a < min_bound)
            a = min_bound;
         fa = f(a);
@@ -153,7 +157,7 @@ typename Dist::value_type
      // If we're looking for a large result, then bump "adder" up
      // by a bit to increase our chances of bracketing the root:
      //
-     //adder = (std::max)(adder, 0.001f * guess);
+     //adder = BOOST_MATH_GPU_SAFE_MAX(adder, 0.001f * guess);
      if(fa < 0)
      {
         b = a + adder;
@@ -162,7 +166,7 @@ typename Dist::value_type
      }
      else
      {
-        b = (std::max)(value_type(a - adder), value_type(0));
+        b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
         if(b < min_bound)
            b = min_bound;
      }
@@ -186,7 +190,7 @@ typename Dist::value_type
         }
         else
         {
-           b = (std::max)(value_type(a - adder), value_type(0));
+           b = BOOST_MATH_GPU_SAFE_MAX(value_type(a - adder), value_type(0));
            if(b < min_bound)
               b = min_bound;
         }
@@ -195,9 +199,8 @@ typename Dist::value_type
      }
      if(a > b)
      {
-        using std::swap;
-        swap(a, b);
-        swap(fa, fb);
+        BOOST_MATH_GPU_SAFE_SWAP(a, b);
+        BOOST_MATH_GPU_SAFE_SWAP(fa, fb);
      }
   }
   //
@@ -274,7 +277,7 @@ typename Dist::value_type
   //
   // Go ahead and find the root:
   //
-  std::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
+  boost::math::pair<value_type, value_type> r = toms748_solve(f, a, b, fa, fb, tol, count, policy_type());
   max_iter += count;
   if (max_iter >= policies::get_max_root_iterations<policy_type>())
   {
@@ -293,7 +296,7 @@ typename Dist::value_type
 // is very close 1.
 //
 template <class Dist>
-inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = ceil(result);
@@ -325,7 +328,7 @@ inline typename Dist::value_type round_to_floor(const Dist& d, typename Dist::va
 #endif
 
 template <class Dist>
-inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::value_type result, typename Dist::value_type p, bool c)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type cc = floor(result);
@@ -339,7 +342,11 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
    //
    while(true)
    {
+      #ifdef BOOST_MATH_HAS_GPU_SUPPORT
+      cc = ceil(nextafter(result, tools::max_value()));
+      #else
       cc = ceil(float_next(result));
+      #endif
       if(cc > support(d).second)
          break;
       pp = c ? cdf(complement(d, cc)) : cdf(d, cc);
@@ -362,7 +369,7 @@ inline typename Dist::value_type round_to_ceil(const Dist& d, typename Dist::val
 // to an int where required.
 //
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       typename Dist::value_type p,
@@ -371,7 +378,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    if(p > 0.5)
    {
@@ -393,7 +400,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -402,7 +409,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -436,7 +443,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -445,7 +452,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -479,7 +486,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -488,7 +495,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
@@ -507,7 +514,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -516,7 +523,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    BOOST_MATH_STD_USING
    typename Dist::value_type pp = c ? 1 - p : p;
@@ -534,7 +541,7 @@ inline typename Dist::value_type
 }
 
 template <class Dist>
-inline typename Dist::value_type
+BOOST_MATH_GPU_ENABLED inline typename Dist::value_type
    inverse_discrete_quantile(
       const Dist& dist,
       const typename Dist::value_type& p,
@@ -543,7 +550,7 @@ inline typename Dist::value_type
       const typename Dist::value_type& multiplier,
       const typename Dist::value_type& adder,
       const policies::discrete_quantile&,
-      std::uintmax_t& max_iter)
+      boost::math::uintmax_t& max_iter)
 {
    typedef typename Dist::value_type value_type;
    BOOST_MATH_STD_USING
diff --git a/include/boost/math/distributions/fisher_f.hpp b/include/boost/math/distributions/fisher_f.hpp
index e22cdf50a..56b288d88 100644
--- a/include/boost/math/distributions/fisher_f.hpp
+++ b/include/boost/math/distributions/fisher_f.hpp
@@ -1,5 +1,5 @@
 // Copyright John Maddock 2006.
-
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0.
 // (See accompanying file LICENSE_1_0.txt
@@ -8,14 +8,15 @@
 #ifndef BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP
 #define BOOST_MATH_DISTRIBUTIONS_FISHER_F_HPP
 
+#include
+#include
+#include
 #include
 #include // for incomplete beta.
 #include // complements
 #include // error checks
 #include
 
-#include
-
 namespace boost{ namespace math{
 
 template <class RealType = double, class Policy = policies::policy<> >
@@ -25,9 +26,9 @@ public:
    typedef RealType value_type;
    typedef Policy policy_type;
 
-   fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j)
+   BOOST_MATH_GPU_ENABLED fisher_f_distribution(const RealType& i, const RealType& j) : m_df1(i), m_df2(j)
   {
-      static const char* function = "fisher_f_distribution<%1%>::fisher_f_distribution";
+      constexpr auto function = "fisher_f_distribution<%1%>::fisher_f_distribution";
      RealType result;
      detail::check_df(
         function, m_df1, &result, Policy());
@@ -35,11 +36,11 @@ public:
         function, m_df2, &result, Policy());
   } // fisher_f_distribution
 
-   RealType degrees_of_freedom1()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom1()const
   {
      return m_df1;
   }
-   RealType degrees_of_freedom2()const
+   BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom2()const
   {
      return m_df2;
   }
@@ -60,29 +61,29 @@ fisher_f_distribution(RealType,RealType)->fisher_f_distribution
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> range(const fisher_f_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const fisher_f_distribution<RealType, Policy>& /*dist*/)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline const std::pair<RealType, RealType> support(const fisher_f_distribution<RealType, Policy>& /*dist*/)
+BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const fisher_f_distribution<RealType, Policy>& /*dist*/)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-RealType pdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED RealType pdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING  // for ADL of std functions
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
    RealType error_result = 0;
-   static const char* function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::pdf(fisher_f_distribution<%1%> const&, %1%)";
    if(false == (detail::check_df(
          function, df1, &error_result, Policy())
       && detail::check_df(
@@ -132,9 +133,9 @@ RealType pdf(const fisher_f_distribution& dist, const RealType
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const fisher_f_distribution<RealType, Policy>& dist, const RealType& x)
 {
-   static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
@@ -167,9 +168,9 @@ inline RealType cdf(const fisher_f_distribution& dist, const R
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const fisher_f_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const fisher_f_distribution<RealType, Policy>& dist, const RealType& p)
 {
-   static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
    // Error check:
@@ -192,9 +193,9 @@ inline RealType quantile(const fisher_f_distribution& dist, co
 } // quantile
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
 {
-   static const char* function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::cdf(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = c.dist.degrees_of_freedom1();
    RealType df2 = c.dist.degrees_of_freedom2();
    RealType x = c.param;
@@ -228,9 +229,9 @@ inline RealType cdf(const complemented2_type
 
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<fisher_f_distribution<RealType, Policy>, RealType>& c)
 {
-   static const char* function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
+   constexpr auto function = "boost::math::quantile(fisher_f_distribution<%1%> const&, %1%)";
    RealType df1 = c.dist.degrees_of_freedom1();
    RealType df2 = c.dist.degrees_of_freedom2();
    RealType p = c.param;
@@ -252,9 +253,9 @@ inline RealType quantile(const complemented2_type
 
 template <class RealType, class Policy>
-inline RealType mean(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const fisher_f_distribution<RealType, Policy>& dist)
 { // Mean of F distribution = v.
- static const char* function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mean(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -273,9 +274,9 @@ inline RealType mean(const fisher_f_distribution& dist) } // mean template -inline RealType variance(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const fisher_f_distribution& dist) { // Variance of F distribution. - static const char* function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::variance(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -294,9 +295,9 @@ inline RealType variance(const fisher_f_distribution& dist) } // variance template -inline RealType mode(const fisher_f_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const fisher_f_distribution& dist) { - static const char* function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; + constexpr auto function = "boost::math::mode(fisher_f_distribution<%1%> const&)"; RealType df1 = dist.degrees_of_freedom1(); RealType df2 = dist.degrees_of_freedom2(); // Error check: @@ -317,15 +318,15 @@ inline RealType mode(const fisher_f_distribution& dist) //template //inline RealType median(const fisher_f_distribution& dist) //{ // Median of Fisher F distribution is not defined. -// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", std::numeric_limits::quiet_NaN()); +// return tools::domain_error(BOOST_CURRENT_FUNCTION, "Median is not implemented, result is %1%!", boost::math::numeric_limits::quiet_NaN()); // } // median // Now implemented via quantile(half) in derived accessors. 
 template <class RealType, class Policy>
-inline RealType skewness(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const fisher_f_distribution<RealType, Policy>& dist)
 {
-   static const char* function = "boost::math::skewness(fisher_f_distribution<%1%> const&)";
+   constexpr auto function = "boost::math::skewness(fisher_f_distribution<%1%> const&)";
    BOOST_MATH_STD_USING // ADL of std names
    // See http://mathworld.wolfram.com/F-Distribution.html
    RealType df1 = dist.degrees_of_freedom1();
@@ -346,18 +347,18 @@ inline RealType skewness(const fisher_f_distribution& dist)
 }
 
 template <class RealType, class Policy>
-RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist);
+BOOST_MATH_GPU_ENABLED RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist);
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const fisher_f_distribution<RealType, Policy>& dist)
 {
    return 3 + kurtosis_excess(dist);
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const fisher_f_distribution<RealType, Policy>& dist)
 {
-   static const char* function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)";
+   constexpr auto function = "boost::math::kurtosis_excess(fisher_f_distribution<%1%> const&)";
    // See http://mathworld.wolfram.com/F-Distribution.html
    RealType df1 = dist.degrees_of_freedom1();
    RealType df2 = dist.degrees_of_freedom2();
diff --git a/include/boost/math/distributions/gamma.hpp b/include/boost/math/distributions/gamma.hpp
index 28b7c55b0..5176f906d 100644
--- a/include/boost/math/distributions/gamma.hpp
+++ b/include/boost/math/distributions/gamma.hpp
@@ -1,4 +1,5 @@
 // Copyright John Maddock 2006.
+// Copyright Matt Borland 2024.
 // Use, modification and distribution are subject to the
 // Boost Software License, Version 1.0. (See accompanying file
 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,22 +11,22 @@
 // http://mathworld.wolfram.com/GammaDistribution.html
 // http://en.wikipedia.org/wiki/Gamma_distribution
 
+#include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
 
-#include
-#include
-
 namespace boost{ namespace math
 {
 namespace detail
 {
 
 template <class RealType, class Policy>
-inline bool check_gamma_shape(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_shape(
       const char* function,
       RealType shape,
       RealType* result, const Policy& pol)
@@ -41,7 +42,7 @@ inline bool check_gamma_shape(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma_x(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma_x(
       const char* function,
       RealType const& x,
       RealType* result, const Policy& pol)
@@ -57,7 +58,7 @@ inline bool check_gamma_x(
 }
 
 template <class RealType, class Policy>
-inline bool check_gamma(
+BOOST_MATH_GPU_ENABLED inline bool check_gamma(
       const char* function,
       RealType scale,
       RealType shape,
@@ -75,19 +76,19 @@ public:
    using value_type = RealType;
    using policy_type = Policy;
 
-   explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
+   BOOST_MATH_GPU_ENABLED explicit gamma_distribution(RealType l_shape, RealType l_scale = 1)
       : m_shape(l_shape), m_scale(l_scale)
   {
      RealType result;
      detail::check_gamma("boost::math::gamma_distribution<%1%>::gamma_distribution", l_scale, l_shape, &result, Policy());
   }
 
-   RealType shape()const
+   BOOST_MATH_GPU_ENABLED RealType shape()const
   {
      return m_shape;
   }
 
-   RealType scale()const
+   BOOST_MATH_GPU_ENABLED RealType scale()const
   {
      return m_scale;
   }
@@ -109,27 +110,27 @@ gamma_distribution(RealType,RealType)->gamma_distribution
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> range(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of permissible values for random variable x.
   using boost::math::tools::max_value;
-  return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline std::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
+BOOST_MATH_GPU_ENABLED inline boost::math::pair<RealType, RealType> support(const gamma_distribution<RealType, Policy>& /* dist */)
 { // Range of supported values for random variable x.
   // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
   using boost::math::tools::max_value;
   using boost::math::tools::min_value;
-  return std::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
+  return boost::math::pair<RealType, RealType>(min_value<RealType>(), max_value<RealType>());
 }
 
 template <class RealType, class Policy>
-inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType pdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -149,17 +150,17 @@ inline RealType pdf(const gamma_distribution& dist, const Real
 } // pdf
 
 template <class RealType, class Policy>
-inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType logpdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
    using boost::math::lgamma;
 
-   static const char* function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::logpdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType k = dist.shape();
    RealType theta = dist.scale();
 
-   RealType result = -std::numeric_limits<RealType>::infinity();
+   RealType result = -boost::math::numeric_limits<RealType>::infinity();
    if(false == detail::check_gamma(function, theta, k, &result, Policy()))
       return result;
    if(false == detail::check_gamma_x(function, x, &result, Policy()))
@@ -167,7 +168,7 @@ inline RealType logpdf(const gamma_distribution& dist, const R
 
    if(x == 0)
    {
-      return std::numeric_limits<RealType>::quiet_NaN();
+      return boost::math::numeric_limits<RealType>::quiet_NaN();
    }
 
    result = -k*log(theta) + (k-1)*log(x) - lgamma(k) - (x/theta);
@@ -176,11 +177,11 @@ inline RealType logpdf(const gamma_distribution& dist, const R
 } // logpdf
 
 template <class RealType, class Policy>
-inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const gamma_distribution<RealType, Policy>& dist, const RealType& x)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -196,11 +197,11 @@ inline RealType cdf(const gamma_distribution& dist, const Real
 } // cdf
 
 template <class RealType, class Policy>
-inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const gamma_distribution<RealType, Policy>& dist, const RealType& p)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -220,11 +221,11 @@ inline RealType quantile(const gamma_distribution& dist, const
 }
 
 template <class RealType, class Policy>
-inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -241,11 +242,11 @@ inline RealType cdf(const complemented2_type
 }
 
 template <class RealType, class Policy>
-inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
+BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<gamma_distribution<RealType, Policy>, RealType>& c)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
+   constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)";
 
    RealType shape = c.dist.shape();
    RealType scale = c.dist.scale();
@@ -266,11 +267,11 @@ inline RealType quantile(const complemented2_type
 }
 
 template <class RealType, class Policy>
-inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mean(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mean(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mean(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -284,11 +285,11 @@ inline RealType mean(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType variance(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::variance(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::variance(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -302,11 +303,11 @@ inline RealType variance(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType mode(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::mode(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::mode(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -331,11 +332,11 @@ inline RealType mode(const gamma_distribution& dist)
 //}
 
 template <class RealType, class Policy>
-inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType skewness(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::skewness(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -349,11 +350,11 @@ inline RealType skewness(const gamma_distribution& dist)
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const gamma_distribution<RealType, Policy>& dist)
 {
    BOOST_MATH_STD_USING // for ADL of std functions
 
-   static const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
+   constexpr auto function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)";
 
    RealType shape = dist.shape();
    RealType scale = dist.scale();
@@ -367,18 +368,19 @@ inline RealType kurtosis_excess(const gamma_distribution& dist
 }
 
 template <class RealType, class Policy>
-inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const gamma_distribution<RealType, Policy>& dist)
 {
    return kurtosis_excess(dist) + 3;
 }
 
 template <class RealType, class Policy>
-inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
+BOOST_MATH_GPU_ENABLED inline RealType entropy(const gamma_distribution<RealType, Policy>& dist)
 {
+   BOOST_MATH_STD_USING
+
    RealType k = dist.shape();
    RealType theta = dist.scale();
-   using std::log;
-   return k + log(theta) + lgamma(k) + (1-k)*digamma(k);
+   return k + log(theta) + boost::math::lgamma(k) + (1-k)*digamma(k);
 }
 
 } // namespace math
diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp
index 7c511ef2d..0a7b383c2 100644
--- a/include/boost/math/distributions/geometric.hpp
+++ b/include/boost/math/distributions/geometric.hpp
@@ -36,6 +36,9 @@
 #ifndef BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 #define BOOST_MATH_SPECIAL_GEOMETRIC_HPP
 
+#include
+#include
+#include
 #include
 #include // for ibeta(a, b, x) == Ix(a, b).
 #include // complement.
@@ -45,10 +48,6 @@
 #include
 #include
 
-#include // using std::numeric_limits;
-#include
-#include
-
 #if defined (BOOST_MSVC)
 #  pragma warning(push)
 // This believed not now necessary, so commented out.
@@ -64,7 +63,7 @@ namespace boost
     {
       // Common error checking routines for geometric distribution function:
       template <class RealType, class Policy>
-      inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol)
       {
         if( !(boost::math::isfinite)(p) || (p < 0) || (p > 1) )
         {
@@ -77,13 +76,13 @@ namespace boost
       }
 
       template <class RealType, class Policy>
-      inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol)
       {
         return check_success_fraction(function, p, result, pol);
       }
 
       template <class RealType, class Policy>
-      inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol)
       {
         if(check_dist(function, p, result, pol) == false)
         {
@@ -100,7 +99,7 @@ namespace boost
       } // Check_dist_and_k
 
       template <class RealType, class Policy>
-      inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
+      BOOST_MATH_GPU_ENABLED inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol)
       {
         if((check_dist(function, p, result, pol) && detail::check_probability(function, prob, result, pol)) == false)
         {
@@ -117,7 +116,7 @@ namespace boost
       typedef RealType value_type;
       typedef Policy policy_type;
 
-      geometric_distribution(RealType p) : m_p(p)
+      BOOST_MATH_GPU_ENABLED geometric_distribution(RealType p) : m_p(p)
      { // Constructor stores success_fraction p.
        RealType result;
        geometric_detail::check_dist(
@@ -127,22 +126,22 @@ namespace boost
      } // geometric_distribution constructor.
 
      // Private data getter class member functions.
-     RealType success_fraction() const
+     BOOST_MATH_GPU_ENABLED RealType success_fraction() const
      { // Probability of success as fraction in range 0 to 1.
        return m_p;
      }
-     RealType successes() const
+     BOOST_MATH_GPU_ENABLED RealType successes() const
      { // Total number of successes r = 1 (for compatibility with negative binomial?).
        return 1;
      }
 
     // Parameter estimation.
     // (These are copies of negative_binomial distribution with successes = 1).
-     static RealType find_lower_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_lower_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_lower_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -163,11 +162,11 @@ namespace boost
        return ibeta_inv(successes, failures + 1, alpha, static_cast<RealType*>(nullptr), Policy());
      } // find_lower_bound_on_p
 
-     static RealType find_upper_bound_on_p(
+     BOOST_MATH_GPU_ENABLED static RealType find_upper_bound_on_p(
        RealType trials,
        RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
+       constexpr auto function = "boost::math::geometric<%1%>::find_upper_bound_on_p";
        RealType result = 0;  // of error checks.
        RealType successes = 1;
        RealType failures = trials - successes;
@@ -195,12 +194,12 @@ namespace boost
 
      // Estimate number of trials :
      // "How many trials do I need to be P% sure of seeing k or fewer failures?"
 
-     static RealType find_minimum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_minimum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_minimum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -213,12 +212,12 @@ namespace boost
        return result + k;
      } // RealType find_number_of_failures
 
-     static RealType find_maximum_number_of_trials(
+     BOOST_MATH_GPU_ENABLED static RealType find_maximum_number_of_trials(
        RealType k,     // number of failures (k >= 0).
        RealType p,     // success fraction 0 <= p <= 1.
        RealType alpha) // risk level threshold 0 <= alpha <= 1.
      {
-       static const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
+       constexpr auto function = "boost::math::geometric<%1%>::find_maximum_number_of_trials";
        // Error checks:
        RealType result = 0;
        if(false == geometric_detail::check_dist_and_k(
@@ -244,22 +243,22 @@ namespace boost
 #endif
 
   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> range(const geometric_distribution<RealType, Policy>& /* dist */)
   { // Range of permissible values for random variable k.
      using boost::math::tools::max_value;
-     return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+     return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
   }
 
   template <class RealType, class Policy>
-  inline const std::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
+  BOOST_MATH_GPU_ENABLED inline const boost::math::pair<RealType, RealType> support(const geometric_distribution<RealType, Policy>& /* dist */)
   { // Range of supported values for random variable k.
     // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
     using boost::math::tools::max_value;
-    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
+    return boost::math::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>()); // max_integer?
   }
 
   template <class RealType, class Policy>
-  inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType mean(const geometric_distribution<RealType, Policy>& dist)
   { // Mean of geometric distribution = (1-p)/p.
     return (1 - dist.success_fraction() ) / dist.success_fraction();
   } // mean
 
  // median implemented via quantile(half) in derived accessors.
 
   template <class RealType, class Policy>
-  inline RealType mode(const geometric_distribution<RealType, Policy>&)
+  BOOST_MATH_GPU_ENABLED inline RealType mode(const geometric_distribution<RealType, Policy>&)
   { // Mode of geometric distribution = zero.
     BOOST_MATH_STD_USING // ADL of std functions.
     return 0;
   } // mode
 
   template <class RealType, class Policy>
-  inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType variance(const geometric_distribution<RealType, Policy>& dist)
   { // Variance of Binomial distribution = (1-p) / p^2.
     return (1 - dist.success_fraction()) / (dist.success_fraction() * dist.success_fraction());
   } // variance
 
   template <class RealType, class Policy>
-  inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType skewness(const geometric_distribution<RealType, Policy>& dist)
   { // skewness of geometric distribution = 2-p / (sqrt(r(1-p))
     BOOST_MATH_STD_USING // ADL of std functions.
     RealType p = dist.success_fraction();
@@ -289,7 +288,7 @@ namespace boost
   } // skewness
 
   template <class RealType, class Policy>
-  inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const geometric_distribution<RealType, Policy>& dist)
   { // kurtosis of geometric distribution
     // http://en.wikipedia.org/wiki/geometric is kurtosis_excess so add 3
     RealType p = dist.success_fraction();
@@ -297,7 +296,7 @@ namespace boost
   } // kurtosis
 
   template <class RealType, class Policy>
-  inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
+  BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const geometric_distribution<RealType, Policy>& dist)
   { // kurtosis excess of geometric distribution
     // http://mathworld.wolfram.com/Kurtosis.html table of kurtosis_excess
     RealType p = dist.success_fraction();
@@ -312,11 +311,11 @@ namespace boost
   // chf of geometric distribution provided by derived accessors.
 
   template <class RealType, class Policy>
-  inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType pdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Probability Density/Mass Function.
     BOOST_FPU_EXCEPTION_GUARD
     BOOST_MATH_STD_USING  // For ADL of math functions.
-    static const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)";
     RealType p = dist.success_fraction();
     RealType result = 0;
@@ -350,9 +349,9 @@ namespace boost
   } // geometric_pdf
 
   template <class RealType, class Policy>
-  inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Cumulative Distribution Function of geometric.
-    static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
 
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
@@ -381,12 +380,10 @@ namespace boost
   } // cdf Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const geometric_distribution<RealType, Policy>& dist, const RealType& k)
   { // Cumulative Distribution Function of geometric.
-    using std::pow;
-    using std::log;
-    using std::exp;
-    static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+    BOOST_MATH_STD_USING
+    constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
 
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
@@ -399,7 +396,7 @@ namespace boost
          k,
          &result, Policy()))
     {
-      return -std::numeric_limits<RealType>::infinity();
+      return -boost::math::numeric_limits<RealType>::infinity();
     }
 
     if(k == 0)
     {
@@ -413,10 +410,10 @@ namespace boost
   } // logcdf Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Complemented Cumulative Distribution Function geometric.
     BOOST_MATH_STD_USING
-    static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)";
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
     RealType const& k = c.param;
@@ -438,10 +435,10 @@ namespace boost
   } // cdf Complemented Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType logcdf(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Complemented Cumulative Distribution Function geometric.
     BOOST_MATH_STD_USING
-    static const char* function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::logcdf(const geometric_distribution<%1%>&, %1%)";
     // k argument may be integral, signed, or unsigned, or floating point.
     // If necessary, it has already been promoted from an integral type.
     RealType const& k = c.param;
@@ -455,21 +452,21 @@ namespace boost
          k,
          &result, Policy()))
     {
-      return -std::numeric_limits<RealType>::infinity();
+      return -boost::math::numeric_limits<RealType>::infinity();
     }
 
     return boost::math::log1p(-p, Policy()) * (k+1);
   } // logcdf Complemented Cumulative Distribution Function geometric.
 
   template <class RealType, class Policy>
-  inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const geometric_distribution<RealType, Policy>& dist, const RealType& x)
   { // Quantile, percentile/100 or Percent Point geometric function.
     // Return the number of expected failures k for a given probability p.
 
     // Inverse cumulative Distribution Function or Quantile (percentile / 100) of geometric Probability.
     // k argument may be integral, signed, or unsigned, or floating point.
-    static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
+    constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)";
     BOOST_MATH_STD_USING // ADL of std functions.
 
     RealType success_fraction = dist.success_fraction();
@@ -513,11 +510,11 @@ namespace boost
   } // RealType quantile(const geometric_distribution dist, p)
 
   template <class RealType, class Policy>
-  inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
+  BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type<geometric_distribution<RealType, Policy>, RealType>& c)
   { // Quantile or Percent Point Binomial function.
     // Return the number of expected failures k for a given
     // complement of the probability Q = 1 - P.
- static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; BOOST_MATH_STD_USING // Error checks: RealType x = c.param; diff --git a/include/boost/math/distributions/inverse_chi_squared.hpp b/include/boost/math/distributions/inverse_chi_squared.hpp index 19dd0371e..1a3c680d2 100644 --- a/include/boost/math/distributions/inverse_chi_squared.hpp +++ b/include/boost/math/distributions/inverse_chi_squared.hpp @@ -1,6 +1,6 @@ // Copyright John Maddock 2010. // Copyright Paul A. Bristow 2010. - +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt @@ -9,6 +9,8 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP #define BOOST_MATH_DISTRIBUTIONS_INVERSE_CHI_SQUARED_HPP +#include +#include #include #include // for incomplete beta. #include // for complements. @@ -24,14 +26,12 @@ // Weisstein, Eric W. "Inverse Chi-Squared Distribution." From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/InverseChi-SquaredDistribution.html -#include - namespace boost{ namespace math{ namespace detail { template - inline bool check_inverse_chi_squared( // Check both distribution parameters. + BOOST_MATH_GPU_ENABLED inline bool check_inverse_chi_squared( // Check both distribution parameters. const char* function, RealType degrees_of_freedom, // degrees_of_freedom (aka nu). RealType scale, // scale (aka sigma^2) @@ -51,7 +51,7 @@ public: typedef RealType value_type; typedef Policy policy_type; - inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) + BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) { RealType result; detail::check_df( @@ -62,7 +62,7 @@ public: m_scale, &result, Policy()); } // inverse_chi_squared_distribution constructor - inverse_chi_squared_distribution(RealType df = 1) : m_df(df) + BOOST_MATH_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1) : m_df(df) { RealType result; m_scale = 1 / m_df ; // Default scale = 1 / degrees of freedom (Wikipedia definition 1). @@ -71,11 +71,11 @@ public: m_df, &result, Policy()); } // inverse_chi_squared_distribution - RealType degrees_of_freedom()const + BOOST_MATH_GPU_ENABLED RealType degrees_of_freedom()const { return m_df; // aka nu } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; // aka xi } @@ -105,28 +105,28 @@ inverse_chi_squared_distribution(RealType,RealType)->inverse_chi_squared_distrib #endif template -inline const std::pair range(const inverse_chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair range(const inverse_chi_squared_distribution& /*dist*/) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); // 0 to + infinity. + return boost::math::pair(static_cast(0), max_value()); // 0 to + infinity. } template -inline const std::pair support(const inverse_chi_squared_distribution& /*dist*/) +BOOST_MATH_GPU_ENABLED inline const boost::math::pair support(const inverse_chi_squared_distribution& /*dist*/) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. 
- return std::pair(static_cast(0), tools::max_value()); // 0 to + infinity. + return boost::math::pair(static_cast(0), tools::max_value()); // 0 to + infinity. } template -RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; - static const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_inverse_chi_squared (function, df, scale, &error_result, Policy()) @@ -159,9 +159,9 @@ RealType pdf(const inverse_chi_squared_distribution& dist, con } // pdf template -inline RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; @@ -188,13 +188,13 @@ inline RealType cdf(const inverse_chi_squared_distribution& di } // cdf template -inline RealType quantile(const inverse_chi_squared_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_chi_squared_distribution& dist, const RealType& p) { using boost::math::gamma_q_inv; RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -220,13 +220,13 @@ inline RealType quantile(const inverse_chi_squared_distribution -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { using boost::math::gamma_q_inv; RealType const& df = c.dist.degrees_of_freedom(); RealType const& scale = c.dist.scale(); RealType const& x = c.param; - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -251,14 +251,14 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { using boost::math::gamma_q_inv; RealType const& df = c.dist.degrees_of_freedom(); RealType const& scale = c.dist.scale(); RealType const& q = c.param; - static const char* function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df(function, df, &error_result, Policy())) @@ -280,12 +280,12 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const 
inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_chi_squared_distribution& dist) { // Mean of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 2) return policies::raise_domain_error( function, @@ -295,11 +295,11 @@ inline RealType mean(const inverse_chi_squared_distribution& d } // mean template -inline RealType variance(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_chi_squared_distribution& dist) { // Variance of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 4) { return policies::raise_domain_error( @@ -311,14 +311,14 @@ inline RealType variance(const inverse_chi_squared_distribution -inline RealType mode(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_chi_squared_distribution& dist) { // mode is not defined in Mathematica. // See Discussion section http://en.wikipedia.org/wiki/Talk:Scaled-inverse-chi-square_distribution // for origin of the formula used below. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; if(df < 0) return policies::raise_domain_error( function, @@ -341,11 +341,11 @@ inline RealType mode(const inverse_chi_squared_distribution& d // Now implemented via quantile(half) in derived accessors. 
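For reference alongside these domain checks (standard results for the scaled inverse chi-squared distribution, not new to this patch): with nu degrees of freedom and scale xi, mean = nu*xi/(nu - 2) for nu > 2, variance = 2*nu^2*xi^2 / ((nu - 2)^2 * (nu - 4)) for nu > 4, mode = nu*xi/(nu + 2), skewness = (4/(nu - 6))*sqrt(2*(nu - 4)) for nu > 6, and excess kurtosis = 12*(5*nu - 22)/((nu - 6)*(nu - 8)) for nu > 8. The df <= 2, 4, 6, and 8 guards in mean(), variance(), skewness(), and kurtosis() reject exactly the parameter ranges where these expressions are undefined.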
template -inline RealType skewness(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_chi_squared_distribution& dist) { BOOST_MATH_STD_USING // For ADL RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 6) return policies::raise_domain_error( function, @@ -356,10 +356,10 @@ inline RealType skewness(const inverse_chi_squared_distribution -inline RealType kurtosis(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, @@ -370,10 +370,10 @@ inline RealType kurtosis(const inverse_chi_squared_distribution -inline RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, diff --git a/include/boost/math/distributions/inverse_gamma.hpp b/include/boost/math/distributions/inverse_gamma.hpp index 8c9e4763d..6aa798ed8 100644 --- a/include/boost/math/distributions/inverse_gamma.hpp +++ b/include/boost/math/distributions/inverse_gamma.hpp @@ -2,6 +2,7 @@ // Copyright Paul A. Bristow 2010. // Copyright John Maddock 2010. +// Copyright Matt Borland 2024. // Use, modification and distribution are subject to the // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -22,21 +23,21 @@ // http://mathworld.wolfram.com/GammaDistribution.html // http://en.wikipedia.org/wiki/Gamma_distribution +#include +#include +#include #include #include #include #include -#include -#include - namespace boost{ namespace math { namespace detail { template -inline bool check_inverse_gamma_shape( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_shape( const char* function, // inverse_gamma RealType shape, // shape aka alpha RealType* result, // to update, perhaps with NaN @@ -57,7 +58,7 @@ inline bool check_inverse_gamma_shape( } //bool check_inverse_gamma_shape template -inline bool check_inverse_gamma_x( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -73,7 +74,7 @@ inline bool check_inverse_gamma_x( } template -inline bool check_inverse_gamma( +BOOST_MATH_GPU_ENABLED inline bool check_inverse_gamma( const char* function, // TODO swap these over, so shape is first. 
RealType scale, // scale aka beta RealType shape, // shape aka alpha @@ -92,7 +93,7 @@ public: using value_type = RealType; using policy_type = Policy; - explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) + BOOST_MATH_GPU_ENABLED explicit inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; @@ -101,12 +102,12 @@ public: l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_MATH_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_MATH_GPU_ENABLED RealType scale()const { return m_scale; } @@ -132,27 +133,27 @@ inverse_gamma_distribution(RealType,RealType)->inverse_gamma_distribution -inline std::pair range(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair range(const inverse_gamma_distribution& /* dist */) { // Range of permissible values for random variable x. using boost::math::tools::max_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline std::pair support(const inverse_gamma_distribution& /* dist */) +BOOST_MATH_GPU_ENABLED inline boost::math::pair support(const inverse_gamma_distribution& /* dist */) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. using boost::math::tools::max_value; using boost::math::tools::min_value; - return std::pair(static_cast(0), max_value()); + return boost::math::pair(static_cast(0), max_value()); } template -inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -195,17 +196,17 @@ inline RealType pdf(const inverse_gamma_distribution& dist, co } // pdf template -inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType logpdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::lgamma; - static const char* function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::logpdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); - RealType result = -std::numeric_limits::infinity(); + RealType result = -boost::math::numeric_limits::infinity(); if(false == detail::check_inverse_gamma(function, scale, shape, &result, Policy())) { // distribution parameters bad. 
return result; @@ -232,11 +233,11 @@ inline RealType logpdf(const inverse_gamma_distribution& dist, } // pdf template -inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -260,12 +261,12 @@ inline RealType cdf(const inverse_gamma_distribution& dist, co } // cdf template -inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const inverse_gamma_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions using boost::math::gamma_q_inv; - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -287,11 +288,11 @@ inline RealType quantile(const inverse_gamma_distribution& dis } template -inline RealType cdf(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -310,11 +311,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +BOOST_MATH_GPU_ENABLED inline RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + constexpr auto function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -338,11 +339,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType mean(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -365,11 +366,11 @@ inline RealType mean(const inverse_gamma_distribution& dist) } // mean template -inline RealType variance(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType variance(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -391,11 +392,11 @@ inline RealType variance(const inverse_gamma_distribution& dis } template -inline RealType mode(const inverse_gamma_distribution& 
dist) +BOOST_MATH_GPU_ENABLED inline RealType mode(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -418,11 +419,11 @@ inline RealType mode(const inverse_gamma_distribution& dist) //} template -inline RealType skewness(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType skewness(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -444,11 +445,11 @@ inline RealType skewness(const inverse_gamma_distribution& dis } template -inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -470,9 +471,9 @@ inline RealType kurtosis_excess(const inverse_gamma_distribution -inline RealType kurtosis(const inverse_gamma_distribution& dist) +BOOST_MATH_GPU_ENABLED inline RealType kurtosis(const inverse_gamma_distribution& dist) { - static const char* function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; + constexpr auto function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index 82d23b8d7..dcc763e37 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -23,6 +23,7 @@ using cuda::std::tuple; using cuda::std::make_pair; +using cuda::std::tie; using cuda::std::get; using cuda::std::tuple_size; diff --git a/test/cuda_jamfile b/test/cuda_jamfile index c9a70e8a9..bae42a8d9 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -64,11 +64,46 @@ run test_extreme_value_pdf_float.cu ; run test_extreme_value_quan_double.cu ; run test_extreme_value_quan_float.cu ; +run test_fisher_f_cdf_double.cu ; +run test_fisher_f_cdf_float.cu ; +run test_fisher_f_pdf_double.cu ; +run test_fisher_f_pdf_float.cu ; +run test_fisher_f_quan_double.cu ; +run test_fisher_f_quan_float.cu ; + +run test_gamma_dist_cdf_double.cu ; +run test_gamma_dist_cdf_float.cu ; +run test_gamma_dist_pdf_double.cu ; +run test_gamma_dist_pdf_float.cu ; +run test_gamma_dist_quan_double.cu ; +run test_gamma_dist_quan_float.cu ; + +run test_geometric_dist_cdf_double.cu ; +run test_geometric_dist_cdf_float.cu ; +run test_geometric_dist_pdf_double.cu ; +run test_geometric_dist_pdf_float.cu ; +run test_geometric_dist_quan_double.cu ; +run test_geometric_dist_quan_float.cu ; + run test_holtsmark_cdf_double.cu ; run test_holtsmark_cdf_float.cu ; run test_holtsmark_pdf_double.cu ; run test_holtsmark_pdf_float.cu ; +run test_inverse_chi_squared_cdf_double.cu ; +run test_inverse_chi_squared_cdf_float.cu ; +run 
test_inverse_chi_squared_pdf_double.cu ; +run test_inverse_chi_squared_pdf_float.cu ; +run test_inverse_chi_squared_quan_double.cu ; +run test_inverse_chi_squared_quan_float.cu ; + +run test_inverse_gamma_cdf_double.cu ; +run test_inverse_gamma_cdf_float.cu ; +run test_inverse_gamma_pdf_double.cu ; +run test_inverse_gamma_pdf_float.cu ; +run test_inverse_gamma_quan_double.cu ; +run test_inverse_gamma_quan_float.cu ; + run test_landau_cdf_double.cu ; run test_landau_cdf_float.cu ; run test_landau_pdf_double.cu ; diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index 1fc2746a1..1b001eb2f 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -59,6 +59,27 @@ run test_extreme_value_pdf_nvrtc_float.cpp ; run test_extreme_value_quan_nvrtc_double.cpp ; run test_extreme_value_quan_nvrtc_float.cpp ; +run test_fisher_f_cdf_nvrtc_double.cpp ; +run test_fisher_f_cdf_nvrtc_float.cpp ; +run test_fisher_f_pdf_nvrtc_double.cpp ; +run test_fisher_f_pdf_nvrtc_float.cpp ; +run test_fisher_f_quan_nvrtc_double.cpp ; +run test_fisher_f_quan_nvrtc_float.cpp ; + +run test_gamma_dist_cdf_nvrtc_double.cpp ; +run test_gamma_dist_cdf_nvrtc_float.cpp ; +run test_gamma_dist_pdf_nvrtc_double.cpp ; +run test_gamma_dist_pdf_nvrtc_float.cpp ; +run test_gamma_dist_quan_nvrtc_double.cpp ; +run test_gamma_dist_quan_nvrtc_float.cpp ; + +run test_geometric_dist_cdf_nvrtc_double.cpp ; +run test_geometric_dist_cdf_nvrtc_float.cpp ; +run test_geometric_dist_pdf_nvrtc_double.cpp ; +run test_geometric_dist_pdf_nvrtc_float.cpp ; +run test_geometric_dist_quan_nvrtc_double.cpp ; +run test_geometric_dist_quan_nvrtc_float.cpp ; + run test_holtsmark_cdf_nvrtc_double.cpp ; run test_holtsmark_cdf_nvrtc_float.cpp ; run test_holtsmark_pdf_nvrtc_double.cpp ; @@ -66,6 +87,20 @@ run test_holtsmark_pdf_nvrtc_float.cpp ; run test_holtsmark_quan_nvrtc_double.cpp ; run test_holtsmark_quan_nvrtc_float.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_cdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_double.cpp ; +run test_inverse_chi_squared_pdf_nvrtc_float.cpp ; +run test_inverse_chi_squared_quan_nvrtc_double.cpp ; +run test_inverse_chi_squared_quan_nvrtc_float.cpp ; + +run test_inverse_gamma_cdf_nvrtc_double.cpp ; +run test_inverse_gamma_cdf_nvrtc_float.cpp ; +run test_inverse_gamma_pdf_nvrtc_double.cpp ; +run test_inverse_gamma_pdf_nvrtc_float.cpp ; +run test_inverse_gamma_quan_nvrtc_double.cpp ; +run test_inverse_gamma_quan_nvrtc_float.cpp ; + run test_landau_cdf_nvrtc_double.cpp ; run test_landau_cdf_nvrtc_float.cpp ; run test_landau_pdf_nvrtc_double.cpp ; diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 5d3d85cd8..bcdfb7e5d 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -17,7 +17,12 @@ run test_cauchy.cpp ; run test_chi_squared.cpp ; run test_exponential_dist.cpp ; run test_extreme_value.cpp ; +run test_fisher_f.cpp ; +run test_gamma_dist.cpp ; +run test_geometric.cpp ; run test_holtsmark.cpp ; +run test_inverse_chi_squared_distribution.cpp ; +run test_inverse_gamma_distribution.cpp ; run test_landau.cpp ; run test_laplace.cpp ; run test_logistic_dist.cpp ; diff --git a/test/test_arcsine_cdf_double.cu b/test/test_arcsine_cdf_double.cu index d6f6f7b35..3ac9e22cd 100644 --- a/test/test_arcsine_cdf_double.cu +++ b/test/test_arcsine_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA 
kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_cdf_float.cu b/test/test_arcsine_cdf_float.cu index 148b1dffb..cc73ce95b 100644 --- a/test/test_arcsine_cdf_float.cu +++ b/test/test_arcsine_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_double.cu b/test/test_arcsine_pdf_double.cu index 7a73bb34e..8f45017ba 100644 --- a/test/test_arcsine_pdf_double.cu +++ b/test/test_arcsine_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_pdf_float.cu b/test/test_arcsine_pdf_float.cu index 54a11253c..c236b7876 100644 --- a/test/test_arcsine_pdf_float.cu +++ b/test/test_arcsine_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_double.cu b/test/test_arcsine_quan_double.cu index 31f6eac8a..a45737063 100644 --- a/test/test_arcsine_quan_double.cu +++ b/test/test_arcsine_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_quan_float.cu b/test/test_arcsine_quan_float.cu index 6decb347b..fd8cd11fc 100644 --- a/test/test_arcsine_quan_float.cu +++ b/test/test_arcsine_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_double.cu b/test/test_arcsine_range_support_double.cu index cec919a1a..b3fb575fa 100644 --- a/test/test_arcsine_range_support_double.cu +++ b/test/test_arcsine_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_arcsine_range_support_float.cu b/test/test_arcsine_range_support_float.cu index d397e0c86..d207d0598 100644 --- a/test/test_arcsine_range_support_float.cu +++ b/test/test_arcsine_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int 
threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_double.cu b/test/test_bernoulli_cdf_double.cu index e4c21ca06..1a6dce645 100644 --- a/test/test_bernoulli_cdf_double.cu +++ b/test/test_bernoulli_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_cdf_float.cu b/test/test_bernoulli_cdf_float.cu index 82c0eabc0..998f24736 100644 --- a/test/test_bernoulli_cdf_float.cu +++ b/test/test_bernoulli_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_double.cu b/test/test_bernoulli_pdf_double.cu index 24b33c16c..147e2f340 100644 --- a/test/test_bernoulli_pdf_double.cu +++ b/test/test_bernoulli_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_pdf_float.cu b/test/test_bernoulli_pdf_float.cu index 08d2ca5a0..49eaea32f 100644 --- a/test/test_bernoulli_pdf_float.cu +++ b/test/test_bernoulli_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_double.cu b/test/test_bernoulli_range_support_double.cu index 86c77bd11..ade952fca 100644 --- a/test/test_bernoulli_range_support_double.cu +++ b/test/test_bernoulli_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_bernoulli_range_support_float.cu b/test/test_bernoulli_range_support_float.cu index cdcf54418..ef276b938 100644 --- a/test/test_bernoulli_range_support_float.cu +++ b/test/test_bernoulli_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_double.cu b/test/test_beta_dist_cdf_double.cu index fa460244a..9188f4305 100644 --- 
a/test/test_beta_dist_cdf_double.cu +++ b/test/test_beta_dist_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_cdf_float.cu b/test/test_beta_dist_cdf_float.cu index 321c84420..0278f6415 100644 --- a/test/test_beta_dist_cdf_float.cu +++ b/test/test_beta_dist_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_double.cu b/test/test_beta_dist_pdf_double.cu index c0ee9272a..e86cf94dd 100644 --- a/test/test_beta_dist_pdf_double.cu +++ b/test/test_beta_dist_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_pdf_float.cu b/test/test_beta_dist_pdf_float.cu index 75e4fa27b..97dd606f2 100644 --- a/test/test_beta_dist_pdf_float.cu +++ b/test/test_beta_dist_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_double.cu b/test/test_beta_dist_quan_double.cu index 101526afa..a6b842e8e 100644 --- a/test/test_beta_dist_quan_double.cu +++ b/test/test_beta_dist_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_beta_dist_quan_float.cu b/test/test_beta_dist_quan_float.cu index 77696c639..48a860f4c 100644 --- a/test/test_beta_dist_quan_float.cu +++ b/test/test_beta_dist_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_cdf_double.cu b/test/test_cauchy_cdf_double.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_double.cu +++ b/test/test_cauchy_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git 
a/test/test_cauchy_cdf_float.cu b/test/test_cauchy_cdf_float.cu index dc99cbe33..526744ba1 100644 --- a/test/test_cauchy_cdf_float.cu +++ b/test/test_cauchy_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_double.cu b/test/test_cauchy_pdf_double.cu index 7a7fe5ba6..62398c31e 100644 --- a/test/test_cauchy_pdf_double.cu +++ b/test/test_cauchy_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_pdf_float.cu b/test/test_cauchy_pdf_float.cu index 5ec3b604b..aff3369b8 100644 --- a/test/test_cauchy_pdf_float.cu +++ b/test/test_cauchy_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_double.cu b/test/test_cauchy_quan_double.cu index 21f4b4dda..0fcaaafe7 100644 --- a/test/test_cauchy_quan_double.cu +++ b/test/test_cauchy_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_quan_float.cu b/test/test_cauchy_quan_float.cu index b6bed1520..9c04c5b12 100644 --- a/test/test_cauchy_quan_float.cu +++ b/test/test_cauchy_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_double.cu b/test/test_cauchy_range_support_double.cu index 4ec792ce3..3a42c1bd3 100644 --- a/test/test_cauchy_range_support_double.cu +++ b/test/test_cauchy_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_cauchy_range_support_float.cu b/test/test_cauchy_range_support_float.cu index 1cdd90e40..e713736e6 100644 --- a/test/test_cauchy_range_support_float.cu +++ b/test/test_cauchy_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_double.cu b/test/test_chi_squared_cdf_double.cu index 1b0c34ce6..c2475883b 100644 --- a/test/test_chi_squared_cdf_double.cu +++ b/test/test_chi_squared_cdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_cdf_float.cu b/test/test_chi_squared_cdf_float.cu index 8ca99ed2e..07dce0d06 100644 --- a/test/test_chi_squared_cdf_float.cu +++ b/test/test_chi_squared_cdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_double.cu b/test/test_chi_squared_pdf_double.cu index ed45246d3..30edafd05 100644 --- a/test/test_chi_squared_pdf_double.cu +++ b/test/test_chi_squared_pdf_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_pdf_float.cu b/test/test_chi_squared_pdf_float.cu index 5a0f97db9..9b205182b 100644 --- a/test/test_chi_squared_pdf_float.cu +++ b/test/test_chi_squared_pdf_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_double.cu b/test/test_chi_squared_quan_double.cu index 3b7dad972..3fae7d966 100644 --- a/test/test_chi_squared_quan_double.cu +++ b/test/test_chi_squared_quan_double.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_chi_squared_quan_float.cu b/test/test_chi_squared_quan_float.cu index 3e779a090..7a717530e 100644 --- a/test/test_chi_squared_quan_float.cu +++ b/test/test_chi_squared_quan_float.cu @@ -64,7 +64,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_double.cu b/test/test_exponential_cdf_double.cu index 8601d1c08..e3a57e86e 100644 --- a/test/test_exponential_cdf_double.cu +++ b/test/test_exponential_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 
512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_cdf_float.cu b/test/test_exponential_cdf_float.cu index aa5ef9153..ed214a495 100644 --- a/test/test_exponential_cdf_float.cu +++ b/test/test_exponential_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_double.cu b/test/test_exponential_pdf_double.cu index 9a5615f1b..530b1023b 100644 --- a/test/test_exponential_pdf_double.cu +++ b/test/test_exponential_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_pdf_float.cu b/test/test_exponential_pdf_float.cu index f15ee3ea8..0801e2d0b 100644 --- a/test/test_exponential_pdf_float.cu +++ b/test/test_exponential_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_double.cu b/test/test_exponential_quan_double.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_double.cu +++ b/test/test_exponential_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_quan_float.cu b/test/test_exponential_quan_float.cu index ea5a5a681..f4eb4c3b1 100644 --- a/test/test_exponential_quan_float.cu +++ b/test/test_exponential_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_double.cu b/test/test_exponential_range_support_double.cu index eec3981d2..c19497ed5 100644 --- a/test/test_exponential_range_support_double.cu +++ b/test/test_exponential_range_support_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_exponential_range_support_float.cu b/test/test_exponential_range_support_float.cu 
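(A sketch of the launch configuration repeated in every one of these hunks, not itself part of the diff:) each CUDA runner halves the per-block thread count from 512 to 256 and derives the grid size by ceiling division, which guarantees blocksPerGrid * threadsPerBlock >= numElements for any block size; the `if (i < numElements)` guard inside each kernel then discards the overshoot threads:

    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    // hypothetical kernel name; each test launches its own kernel this way
    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input, output, numElements);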
index 00f443e52..a111090de 100644 --- a/test/test_exponential_range_support_float.cu +++ b/test/test_exponential_range_support_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_double.cu b/test/test_extreme_value_cdf_double.cu index 8f7f366b3..7ca000348 100644 --- a/test/test_extreme_value_cdf_double.cu +++ b/test/test_extreme_value_cdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_cdf_float.cu b/test/test_extreme_value_cdf_float.cu index d1b6cc762..bc3ead6eb 100644 --- a/test/test_extreme_value_cdf_float.cu +++ b/test/test_extreme_value_cdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_double.cu b/test/test_extreme_value_pdf_double.cu index 4cf3fc2d0..44ccc5b71 100644 --- a/test/test_extreme_value_pdf_double.cu +++ b/test/test_extreme_value_pdf_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_pdf_float.cu b/test/test_extreme_value_pdf_float.cu index c0c5da7ee..390622f40 100644 --- a/test/test_extreme_value_pdf_float.cu +++ b/test/test_extreme_value_pdf_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_double.cu b/test/test_extreme_value_quan_double.cu index 703d2054f..41f2f69a6 100644 --- a/test/test_extreme_value_quan_double.cu +++ b/test/test_extreme_value_quan_double.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_extreme_value_quan_float.cu b/test/test_extreme_value_quan_float.cu index 25d982cd0..5fe16e9a8 100644 --- a/test/test_extreme_value_quan_float.cu +++ b/test/test_extreme_value_quan_float.cu @@ -65,7 +65,7 @@ int main(void) } // Launch the Vector Add CUDA Kernel - int threadsPerBlock = 512; + int threadsPerBlock = 256; int blocksPerGrid =(numElements + threadsPerBlock - 1) / 
threadsPerBlock; std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; diff --git a/test/test_fisher_f.cpp b/test/test_fisher_f.cpp index c18ed8ff1..f142a3327 100644 --- a/test/test_fisher_f.cpp +++ b/test/test_fisher_f.cpp @@ -8,9 +8,13 @@ // (See accompanying file LICENSE_1_0.txt // or copy at http://www.boost.org/LICENSE_1_0.txt) -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif #include // for fisher_f_distribution using boost::math::fisher_f_distribution; diff --git a/test/test_fisher_f_cdf_double.cu b/test/test_fisher_f_cdf_double.cu new file mode 100644 index 000000000..c6d6f0a94 --- /dev/null +++ b/test/test_fisher_f_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_float.cu b/test/test_fisher_f_cdf_float.cu new file mode 100644 index 000000000..9df1bc869 --- /dev/null +++ b/test/test_fisher_f_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_cdf_nvrtc_double.cpp b/test/test_fisher_f_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..1eb9cb00f --- /dev/null +++ b/test/test_fisher_f_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + 
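    // (Sketch, not in the patch:) nvrtcGetPTXSize and nvrtcGetPTX also return an
    // nvrtcResult, so these two calls could be routed through the checkNVRTCError
    // helper defined above rather than being left unchecked, e.g.:
    //   checkNVRTCError(nvrtcGetPTXSize(prog, &ptx_size), "Failed to get PTX size");
    //   checkNVRTCError(nvrtcGetPTX(prog, ptx), "Failed to get PTX");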
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_fisher_f_cdf_nvrtc_float.cpp b/test/test_fisher_f_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..244190cf1
--- /dev/null
+++ b/test/test_fisher_f_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/distributions/fisher_f.hpp>
+#include
+extern "C" __global__
+void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_fisher_f_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::fisher_f_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_fisher_f_pdf_double.cu b/test/test_fisher_f_pdf_double.cu
new file mode 100644
index 000000000..77a3b655a
--- /dev/null
+++ b/test/test_fisher_f_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::fisher_f_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_fisher_f_pdf_float.cu b/test/test_fisher_f_pdf_float.cu
new file mode 100644
index 000000000..323edf342
--- /dev/null
+++ b/test/test_fisher_f_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/fisher_f.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::fisher_f_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch cuda_test kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::fisher_f_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/test/test_fisher_f_pdf_nvrtc_double.cpp b/test/test_fisher_f_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..8aa1482aa
--- /dev/null
+++ b/test/test_fisher_f_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_pdf_nvrtc_float.cpp b/test/test_fisher_f_pdf_nvrtc_float.cpp new file mode 100644 index 000000000..e461dea9a --- /dev/null +++ b/test/test_fisher_f_pdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_double.cu b/test/test_fisher_f_quan_double.cu new file mode 100644 index 000000000..c16eb2a95 --- /dev/null +++ b/test/test_fisher_f_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_float.cu b/test/test_fisher_f_quan_float.cu new file mode 100644 index 000000000..85cf47967 --- /dev/null +++ b/test/test_fisher_f_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::fisher_f_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_fisher_f_quan_nvrtc_double.cpp b/test/test_fisher_f_quan_nvrtc_double.cpp new file mode 100644 index 000000000..16ad0cbc0 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_fisher_f_quan_nvrtc_float.cpp b/test/test_fisher_f_quan_nvrtc_float.cpp new file mode 100644 index 000000000..377048e52 --- /dev/null +++ b/test/test_fisher_f_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_fisher_f_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::fisher_f_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_fisher_f_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_fisher_f_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_fisher_f_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::fisher_f_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist.cpp b/test/test_gamma_dist.cpp index b7776c79c..2b1a181f3 100644 --- a/test/test_gamma_dist.cpp +++ b/test/test_gamma_dist.cpp @@ -15,16 +15,23 @@ // From MathWorld--A Wolfram Web Resource. // http://mathworld.wolfram.com/GammaDistribution.html +#ifndef SYCL_LANGUAGE_VERSION #include // include directory libs/math/src/tr1/ is needed. +#endif +#include + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept +#endif + #define BOOST_TEST_MAIN #include // Boost.Test #include #include using boost::math::gamma_distribution; -#include +#include "../include_private/boost/math/tools/test.hpp" #include "test_out_of_range.hpp" #include diff --git a/test/test_gamma_dist_cdf_double.cu b/test/test_gamma_dist_cdf_double.cu new file mode 100644 index 000000000..4777196aa --- /dev/null +++ b/test/test_gamma_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_cdf_float.cu b/test/test_gamma_dist_cdf_float.cu new file mode 100644 index 000000000..a93aca395 --- /dev/null +++ b/test/test_gamma_dist_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_cdf_nvrtc_double.cpp b/test/test_gamma_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..3e911f4e0 --- /dev/null +++ b/test/test_gamma_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] 
= static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_cdf_nvrtc_float.cpp b/test/test_gamma_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..17762d406 --- /dev/null +++ b/test/test_gamma_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_pdf_double.cu b/test/test_gamma_dist_pdf_double.cu new file mode 100644 index 000000000..a8411d5b6 --- /dev/null +++ b/test/test_gamma_dist_pdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::gamma_distribution<float_type>(1, 1), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_gamma_dist_pdf_float.cu b/test/test_gamma_dist_pdf_float.cu
new file mode 100644
index 000000000..6ab3247ac
--- /dev/null
+++ b/test/test_gamma_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_pdf_nvrtc_double.cpp b/test/test_gamma_dist_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..1faae9986 --- /dev/null +++ b/test/test_gamma_dist_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/gamma.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/gamma.hpp>
+extern "C" __global__
+void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::gamma_distribution<float_type>(1, 1), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_gamma_dist_pdf_nvrtc_float.cpp b/test/test_gamma_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..054ddbbad
--- /dev/null
+++ b/test/test_gamma_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = 
static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_quan_double.cu b/test/test_gamma_dist_quan_double.cu new file mode 100644 index 000000000..d29bf6d6b --- /dev/null +++ b/test/test_gamma_dist_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_float.cu b/test/test_gamma_dist_quan_float.cu new file mode 100644 index 000000000..58aa42e90 --- /dev/null +++ b/test/test_gamma_dist_quan_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::gamma_distribution(1, 1), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_gamma_dist_quan_nvrtc_double.cpp b/test/test_gamma_dist_quan_nvrtc_double.cpp new file mode 100644 index 000000000..132efcd6c --- /dev/null +++ b/test/test_gamma_dist_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::gamma_distribution(1, 1), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_gamma_dist_quan_nvrtc_float.cpp b/test/test_gamma_dist_quan_nvrtc_float.cpp new file mode 100644 index 000000000..7749523ab --- /dev/null +++ b/test/test_gamma_dist_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::gamma_distribution(1, 1), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + 
h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::gamma_distribution<float_type>(1, 1), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric.cpp b/test/test_geometric.cpp
index 928a2aa0e..13a9e090b 100644
--- a/test/test_geometric.cpp
+++ b/test/test_geometric.cpp
@@ -26,9 +26,14 @@
 # define TEST_REAL_CONCEPT
 #endif
 
-#include <boost/math/tools/test.hpp>
+#include <boost/math/tools/config.hpp>
+
+#include "../include_private/boost/math/tools/test.hpp"
+
+#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
 #include <boost/math/concepts/real_concept.hpp> // for real_concept
 using ::boost::math::concepts::real_concept;
+#endif
 
 #include <boost/math/distributions/geometric.hpp> // for geometric_distribution
 using boost::math::geometric_distribution;
@@ -64,7 +69,11 @@ void test_spot( // Test a single spot value against 'known good' values.
   RealType tol,     // Test tolerance
   RealType logtol)  // Logcdf Test tolerance.
 {
-  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value || std::is_same<RealType, real_concept>::value)
+  BOOST_IF_CONSTEXPR (std::is_same<RealType, long double>::value
+  #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+  || std::is_same<RealType, real_concept>::value
+  #endif
+  )
   {
     logtol *= 100;
   }
@@ -376,7 +385,9 @@ if(std::numeric_limits<RealType>::is_specialized)
     static_cast<RealType>(9.9000000000003448e-201L), //
     100 * tolerance); // Note difference
-  // p nearer unity.
+  // p nearer unity.
+  // On GPU this gets flushed to 0 which has an eps difference of 3.4e+38
+  #ifndef BOOST_MATH_HAS_GPU_SUPPORT
   BOOST_CHECK_CLOSE_FRACTION( //
     pdf(geometric_distribution<RealType>(static_cast<RealType>(0.9999)), static_cast<RealType>(10) ), // Number of failures, k
     static_cast<RealType>(9.999e-41), // exact from 100 digit calculator.
     2e3 * tolerance); // Note bigger tolerance needed.
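The guard above exists because epsilon_difference() saturates once a denormal-range result is flushed to zero: the reported distance becomes roughly the largest representable float, so no finite tolerance can pass. A minimal host-side sketch of that failure mode, assuming only the real boost::math::epsilon_difference API and the 9.999e-41 value quoted above (the program itself is illustrative and not part of this patch):

#include <boost/math/special_functions/relative_difference.hpp>
#include <iostream>

int main()
{
    // Serial (host) result of the near-unity spot check, in the denormal range.
    const float exact = 9.999e-41f;
    // The same value after a GPU flush-to-zero.
    const float flushed = 0.0f;

    // Prints the ULP-style distance used by the verification loops in this
    // patch; per the comment above, it is on the order of 3.4e+38 eps, which
    // is why the check is compiled out under BOOST_MATH_HAS_GPU_SUPPORT.
    std::cout << boost::math::epsilon_difference(exact, flushed) << " eps\n";

    return 0;
}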
+ #endif // Moshier Cephes 100 digits calculator says 9.999e-41 //0.9999*pow(1-0.9999,10) diff --git a/test/test_geometric_dist_cdf_double.cu b/test/test_geometric_dist_cdf_double.cu new file mode 100644 index 000000000..98b6510ad --- /dev/null +++ b/test/test_geometric_dist_cdf_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_geometric_dist_cdf_float.cu b/test/test_geometric_dist_cdf_float.cu new file mode 100644 index 000000000..2662ac07c --- /dev/null +++ b/test/test_geometric_dist_cdf_float.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} \ No newline at end of file diff --git a/test/test_geometric_dist_cdf_nvrtc_double.cpp b/test/test_geometric_dist_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..f8c5ed5aa --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.5), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_geometric_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX 
from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::geometric_distribution(0.5), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_geometric_dist_cdf_nvrtc_float.cpp b/test/test_geometric_dist_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..a53cd0d97 --- /dev/null +++ b/test/test_geometric_dist_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_pdf_double.cu b/test/test_geometric_dist_pdf_double.cu
new file mode 100644
index 000000000..03d2dc007
--- /dev/null
+++ b/test/test_geometric_dist_pdf_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_float.cu b/test/test_geometric_dist_pdf_float.cu
new file mode 100644
index 000000000..1034d122b
--- /dev/null
+++ b/test/test_geometric_dist_pdf_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_pdf_nvrtc_double.cpp b/test/test_geometric_dist_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..8a6b5756e
--- /dev/null
+++ b/test/test_geometric_dist_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_pdf_nvrtc_float.cpp b/test/test_geometric_dist_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..dfb05105d
--- /dev/null
+++ b/test/test_geometric_dist_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_quan_double.cu b/test/test_geometric_dist_quan_double.cu
new file mode 100644
index 000000000..fcac938e5
--- /dev/null
+++ b/test/test_geometric_dist_quan_double.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_float.cu b/test/test_geometric_dist_quan_float.cu
new file mode 100644
index 000000000..074952202
--- /dev/null
+++ b/test/test_geometric_dist_quan_float.cu
@@ -0,0 +1,109 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch geometric distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::geometric_distribution<float_type>(0.5), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 1000.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/test/test_geometric_dist_quan_nvrtc_double.cpp b/test/test_geometric_dist_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..52b2e97ec
--- /dev/null
+++ b/test/test_geometric_dist_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_geometric_dist_quan_nvrtc_float.cpp b/test/test_geometric_dist_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..a83cf857e
--- /dev/null
+++ b/test/test_geometric_dist_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/geometric.hpp>
+extern "C" __global__
+void test_geometric_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::geometric_distribution<float_type>(0.5), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_geometric_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_geometric_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_geometric_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::geometric_distribution<float_type>(0.5), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_holtsmark_cdf_double.cu b/test/test_holtsmark_cdf_double.cu
index 5a02b7ddb..6b1d57041 100644
--- a/test/test_holtsmark_cdf_double.cu
+++ b/test/test_holtsmark_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_cdf_float.cu b/test/test_holtsmark_cdf_float.cu
index 71ae21f73..2a3533bac 100644
--- a/test/test_holtsmark_cdf_float.cu
+++ b/test/test_holtsmark_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_pdf_double.cu b/test/test_holtsmark_pdf_double.cu
index b883515a7..a53360d20 100644
--- a/test/test_holtsmark_pdf_double.cu
+++ b/test/test_holtsmark_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_holtsmark_pdf_float.cu b/test/test_holtsmark_pdf_float.cu
index c56815973..57052803f 100644
--- a/test/test_holtsmark_pdf_float.cu
+++ b/test/test_holtsmark_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_inverse_chi_squared_cdf_double.cu b/test/test_inverse_chi_squared_cdf_double.cu
new file mode 100644
index 000000000..9703e7a3a
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_float.cu b/test/test_inverse_chi_squared_cdf_float.cu
new file mode 100644
index 000000000..bb56a4872
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(cdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
new file mode 100644
index 000000000..b221aedaa
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/tools/config.hpp>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+extern "C" __global__
+void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = cdf(boost::math::inverse_chi_squared_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
new file mode 100644
index 000000000..743654c14
--- /dev/null
+++ b/test/test_inverse_chi_squared_cdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_distribution.cpp b/test/test_inverse_chi_squared_distribution.cpp index a69782418..cbc9dcf19 100644 --- a/test/test_inverse_chi_squared_distribution.cpp +++ b/test/test_inverse_chi_squared_distribution.cpp @@ -14,11 +14,14 @@ // http://www.wolframalpha.com/input/?i=inverse+chisquare+distribution -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION diff --git a/test/test_inverse_chi_squared_pdf_double.cu b/test/test_inverse_chi_squared_pdf_double.cu new file mode 100644 index 000000000..f30611749 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
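
Both the runtime-API (.cu) and NVRTC harnesses size their launches with the same ceiling-division idiom, which is why every kernel guards with `if (i < numElements)`: the last block may be only partially populated. A standalone sketch of the arithmetic:

#include <iostream>

int main()
{
    const int blockSize   = 256;
    const int numElements = 50000;
    // Round up so that blockSize * numBlocks >= numElements always holds.
    const int numBlocks = (numElements + blockSize - 1) / blockSize;
    std::cout << numBlocks << " blocks of " << blockSize
              << " threads cover " << numBlocks * blockSize << " slots\n";
}
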
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+  try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_chi_squared_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+  }
+  catch(const std::exception& e)
+  {
+    std::cerr << "Stopped with exception: " << e.what() << std::endl;
+  }
+  return 0;
+}
diff --git a/test/test_inverse_chi_squared_pdf_float.cu b/test/test_inverse_chi_squared_pdf_float.cu
new file mode 100644
index 000000000..8a3d1c1ef
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp new file mode 100644 index 000000000..4608b3bd6 --- /dev/null +++ b/test/test_inverse_chi_squared_pdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
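
The .cu tests lean on cuda_managed_ptr from the repository's test helpers. That header is not part of this patch; assuming it wraps cudaMallocManaged (unified memory readable from both host and device, which matches how .get() and operator[] are used above), a hypothetical sketch of such a helper could look like this. The class name and interface here are illustrative, not the repository's actual header:

#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

template <class T>
class managed_buffer // hypothetical stand-in for cuda_managed_ptr<T>
{
    T* p_ = nullptr;
public:
    explicit managed_buffer(std::size_t n)
    {
        // One allocation visible to both host and device code.
        if (cudaMallocManaged(&p_, n * sizeof(T)) != cudaSuccess)
            throw std::runtime_error("cudaMallocManaged failed");
    }
    ~managed_buffer() { cudaFree(p_); }
    managed_buffer(const managed_buffer&) = delete;
    managed_buffer& operator=(const managed_buffer&) = delete;
    T* get() const { return p_; }
    T& operator[](std::size_t i) const { return p_[i]; }
};
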
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <boost/math/distributions/inverse_chi_squared.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+extern "C" __global__
+void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_chi_squared_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..8b4db55c0
--- /dev/null
+++ b/test/test_inverse_chi_squared_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = pdf(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_double.cu b/test/test_inverse_chi_squared_quan_double.cu new file mode 100644 index 000000000..f9022c6a3 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
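
Since the quantile is the inverse CDF, the quan tests' uniform draws on [0, 1) are valid probability inputs, and a host-side round trip makes a quick sanity check for the serial reference itself. A sketch using the same default-constructed distribution as the tests:

#include <boost/math/distributions/inverse_chi_squared.hpp>
#include <iostream>

int main()
{
    boost::math::inverse_chi_squared_distribution<double> dist; // defaults, as in the tests
    const double p = 0.25;
    const double x = quantile(dist, p);      // inverse CDF
    std::cout << "p = " << p
              << ", cdf(quantile(p)) = " << cdf(dist, x) << std::endl; // ~0.25
}
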
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_quan_float.cu b/test/test_inverse_chi_squared_quan_float.cu new file mode 100644 index 000000000..10aa6d707 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_chi_squared distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::inverse_chi_squared_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_double.cpp b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp new file mode 100644 index 000000000..0f8a9a5f8 --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
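
A point worth spelling out in the driver-API launches above: cuLaunchKernel receives the kernel arguments as an array of pointers to each argument, in declaration order, so scalars such as numElements are passed by address too. A sketch of just that convention, factored into a helper (the function name is illustrative):

#include <cuda.h>

// Launch a kernel of signature (const double*, const double*, double*, int)
// via the driver API; each args[] entry is the address of one argument.
CUresult launch_test_kernel(CUfunction kernel, double* d_in1, double* d_in2,
                            double* d_out, int numElements)
{
    const int blockSize = 256;
    const int numBlocks = (numElements + blockSize - 1) / blockSize;
    void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
    return cuLaunchKernel(kernel,
                          numBlocks, 1, 1,   // grid dimensions
                          blockSize, 1, 1,   // block dimensions
                          0,                 // dynamic shared memory bytes
                          0,                 // default stream
                          args, nullptr);    // kernel params, no extra options
}
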
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i 
< numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_chi_squared_quan_nvrtc_float.cpp b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp new file mode 100644 index 000000000..ab494a8da --- /dev/null +++ b/test/test_inverse_chi_squared_quan_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_chi_squared_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = quantile(boost::math::inverse_chi_squared_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_chi_squared_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_chi_squared_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_chi_squared_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < 
numElements; ++i) + { + h_in1[i] = static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = quantile(boost::math::inverse_chi_squared_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_double.cu b/test/test_inverse_gamma_cdf_double.cu new file mode 100644 index 000000000..4368a2284 --- /dev/null +++ b/test/test_inverse_gamma_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
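
For the inverse-gamma files that follow, one handy cross-check not used by the tests themselves: if X follows an inverse-gamma distribution with shape a and scale b, then 1/X follows a gamma distribution with shape a and scale 1/b, so the two CDFs must agree. A host-only sketch with arbitrary parameters:

#include <boost/math/distributions/inverse_gamma.hpp>
#include <boost/math/distributions/gamma.hpp>
#include <iostream>

int main()
{
    const double a = 2.0, b = 3.0, x = 1.5;
    boost::math::inverse_gamma_distribution<double> ig(a, b);
    boost::math::gamma_distribution<double>         g(a, 1.0 / b);
    // P(X <= x) = P(1/X >= 1/x) = 1 - F_gamma(1/x)
    std::cout << cdf(ig, x) << " == " << cdf(complement(g, 1.0 / x)) << std::endl;
}
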
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_gamma_cdf_float.cu b/test/test_inverse_gamma_cdf_float.cu new file mode 100644 index 000000000..cef2ec955 --- /dev/null +++ b/test/test_inverse_gamma_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_double.cpp b/test/test_inverse_gamma_cdf_nvrtc_double.cpp new file mode 100644 index 000000000..c5a4b9878 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
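
The .cu harnesses time both the kernel and the serial reference with a `watch` from the repository's stopwatch.hpp. That header is not part of this patch; assuming it is a thin std::chrono wrapper (which matches how elapsed() and reset() are used above), a hypothetical equivalent:

#include <chrono>

class watch_sketch // illustrative; the repository's class is named `watch`
{
    std::chrono::steady_clock::time_point start_ = std::chrono::steady_clock::now();
public:
    // Seconds since construction or the last reset().
    double elapsed() const
    {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - start_).count();
    }
    void reset() { start_ = std::chrono::steady_clock::now(); }
};
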
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] 
= static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_cdf_nvrtc_float.cpp b/test/test_inverse_gamma_cdf_nvrtc_float.cpp new file mode 100644 index 000000000..d76d51225 --- /dev/null +++ b/test/test_inverse_gamma_cdf_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef float float_type; + +const char* cuda_kernel = R"( +typedef float float_type; +#include +#include +extern "C" __global__ +void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(), in1[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = 
static_cast(dist(rng)); + h_in2[i] = static_cast(dist(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + auto res = cdf(boost::math::inverse_gamma_distribution(), h_in1[i]); + + if (boost::math::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_inverse_gamma_distribution.cpp b/test/test_inverse_gamma_distribution.cpp index 68b238fbc..436131d83 100644 --- a/test/test_inverse_gamma_distribution.cpp +++ b/test/test_inverse_gamma_distribution.cpp @@ -14,11 +14,14 @@ # pragma warning (disable : 4310) // cast truncates constant value #endif -#include +#include +#include "../include_private/boost/math/tools/test.hpp" + +#ifndef BOOST_MATH_HAS_GPU_SUPPORT #include // for real_concept using ::boost::math::concepts::real_concept; +#endif -//#include #define BOOST_TEST_MAIN #include // for test_main #include // for BOOST_CHECK_CLOSE_FRACTION diff --git a/test/test_inverse_gamma_pdf_double.cu b/test/test_inverse_gamma_pdf_double.cu new file mode 100644 index 000000000..fa5073dbe --- /dev/null +++ b/test/test_inverse_gamma_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
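
The test_inverse_gamma_distribution.cpp hunk above fences the real_concept archetype behind BOOST_MATH_HAS_GPU_SUPPORT, so any later use of real_concept in that file has to sit behind the same guard. A sketch of the pattern (test_spots is a stand-in for the file's real test driver):

#ifndef BOOST_MATH_HAS_GPU_SUPPORT
#include <boost/math/concepts/real_concept.hpp>
using ::boost::math::concepts::real_concept;
#endif

template <class RealType>
void test_spots(RealType) { /* distribution spot checks go here */ }

void run_all_types()
{
    test_spots(0.0f);
    test_spots(0.0);
#ifndef BOOST_MATH_HAS_GPU_SUPPORT
    test_spots(real_concept(0.0)); // archetype run only when not targeting GPU
#endif
}
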
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_float.cu b/test/test_inverse_gamma_pdf_float.cu
new file mode 100644
index 000000000..c2d80fe8d
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(pdf(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_pdf_nvrtc_double.cpp b/test/test_inverse_gamma_pdf_nvrtc_double.cpp
new file mode 100644
index 000000000..db2c8c4e1
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_pdf_nvrtc_float.cpp b/test/test_inverse_gamma_pdf_nvrtc_float.cpp
new file mode 100644
index 000000000..4d552cf61
--- /dev/null
+++ b/test/test_inverse_gamma_pdf_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = pdf(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = pdf(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_quan_double.cu b/test/test_inverse_gamma_quan_double.cu
new file mode 100644
index 000000000..c9095d752
--- /dev/null
+++ b/test/test_inverse_gamma_quan_double.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_quan_float.cu b/test/test_inverse_gamma_quan_float.cu
new file mode 100644
index 000000000..3e60feaa1
--- /dev/null
+++ b/test/test_inverse_gamma_quan_float.cu
@@ -0,0 +1,110 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real_distribution.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA kernel device code
+ */
+__global__ void cuda_test(const float_type *in1, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    try{
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vector
+    boost::random::mt19937 gen;
+    boost::random::uniform_real_distribution<float_type> dist;
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(gen);
+    }
+
+    // Launch the CUDA kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch inverse_gamma distribution kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(quantile(boost::math::inverse_gamma_distribution<float_type>(), input_vector1[i]));
+    double t = w.elapsed();
+    // Check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+    }
+    return 0;
+}
diff --git a/test/test_inverse_gamma_quan_nvrtc_double.cpp b/test/test_inverse_gamma_quan_nvrtc_double.cpp
new file mode 100644
index 000000000..a49600bde
--- /dev/null
+++ b/test/test_inverse_gamma_quan_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_inverse_gamma_quan_nvrtc_float.cpp b/test/test_inverse_gamma_quan_nvrtc_float.cpp
new file mode 100644
index 000000000..f71ed964a
--- /dev/null
+++ b/test/test_inverse_gamma_quan_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <boost/math/distributions/inverse_gamma.hpp>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+#include <vector>
+
+typedef float float_type;
+
+const char* cuda_kernel = R"(
+typedef float float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/distributions/inverse_gamma.hpp>
+extern "C" __global__
+void test_inverse_gamma_kernel(const float_type *in1, const float_type*, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = quantile(boost::math::inverse_gamma_distribution<float_type>(), in1[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main()
+{
+    try
+    {
+        // Initialize the CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create a CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_inverse_gamma_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_inverse_gamma_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS)
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load the PTX into a CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_inverse_gamma_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize the input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> dist(0.0f, 1.0f);
+        for (int i = 0; i < numElements; ++i)
+        {
+            h_in1[i] = static_cast<float_type>(dist(rng));
+            h_in2[i] = static_cast<float_type>(dist(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify the results
+        for (int i = 0; i < numElements; ++i)
+        {
+            auto res = quantile(boost::math::inverse_gamma_distribution<float_type>(), h_in1[i]);
+
+            if (boost::math::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                              << "\nParallel: " << h_out[i]
+                              << "\n  Serial: " << res
+                              << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_landau_cdf_double.cu b/test/test_landau_cdf_double.cu
index 092fff00e..40bff707d 100644
--- a/test/test_landau_cdf_double.cu
+++ b/test/test_landau_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_cdf_float.cu b/test/test_landau_cdf_float.cu
index 143755aff..c4513c084 100644
--- a/test/test_landau_cdf_float.cu
+++ b/test/test_landau_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_pdf_double.cu b/test/test_landau_pdf_double.cu
index eea6f87ad..6ce3f5f78 100644
--- a/test/test_landau_pdf_double.cu
+++ b/test/test_landau_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
    // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_pdf_float.cu b/test/test_landau_pdf_float.cu
index a424bdd67..5818ddf8a 100644
--- a/test/test_landau_pdf_float.cu
+++ b/test/test_landau_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_quan_double.cu b/test/test_landau_quan_double.cu
index 8cdf12588..4995bd49c 100644
--- a/test/test_landau_quan_double.cu
+++ b/test/test_landau_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_landau_quan_float.cu b/test/test_landau_quan_float.cu
index 8cdf12588..4995bd49c 100644
--- a/test/test_landau_quan_float.cu
+++ b/test/test_landau_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_cdf_double.cu b/test/test_laplace_cdf_double.cu
index cddcfa2bc..ec3c83ecd 100644
--- a/test/test_laplace_cdf_double.cu
+++ b/test/test_laplace_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_cdf_float.cu b/test/test_laplace_cdf_float.cu
index 2af43f9f5..96acea2fd 100644
--- a/test/test_laplace_cdf_float.cu
+++ b/test/test_laplace_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_pdf_double.cu b/test/test_laplace_pdf_double.cu
index 2f53c0dd1..568be622b 100644
--- a/test/test_laplace_pdf_double.cu
+++ b/test/test_laplace_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_pdf_float.cu b/test/test_laplace_pdf_float.cu
index a8d673dba..cb2aa67c1 100644
--- a/test/test_laplace_pdf_float.cu
+++ b/test/test_laplace_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_quan_double.cu b/test/test_laplace_quan_double.cu
index cddcfa2bc..ec3c83ecd 100644
--- a/test/test_laplace_quan_double.cu
+++ b/test/test_laplace_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_laplace_quan_float.cu b/test/test_laplace_quan_float.cu
index 2af43f9f5..96acea2fd 100644
--- a/test/test_laplace_quan_float.cu
+++ b/test/test_laplace_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_cdf_double.cu b/test/test_logistic_cdf_double.cu
index 5dd3723c5..6b4e85025 100644
--- a/test/test_logistic_cdf_double.cu
+++ b/test/test_logistic_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_cdf_float.cu b/test/test_logistic_cdf_float.cu
index 89d05747b..75b6ab0af 100644
--- a/test/test_logistic_cdf_float.cu
+++ b/test/test_logistic_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_pdf_double.cu b/test/test_logistic_pdf_double.cu
index 39aaa1597..90232a2d6 100644
--- a/test/test_logistic_pdf_double.cu
+++ b/test/test_logistic_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_pdf_float.cu b/test/test_logistic_pdf_float.cu
index 279112b99..0a99ff9cf 100644
--- a/test/test_logistic_pdf_float.cu
+++ b/test/test_logistic_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_quan_double.cu b/test/test_logistic_quan_double.cu
index ad929d442..afe8a4c8c 100644
--- a/test/test_logistic_quan_double.cu
+++ b/test/test_logistic_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_logistic_quan_float.cu b/test/test_logistic_quan_float.cu
index 81c22ea4b..92c371062 100644
--- a/test/test_logistic_quan_float.cu
+++ b/test/test_logistic_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_cdf_double.cu b/test/test_mapairy_cdf_double.cu
index 1494181bf..7cb62a934 100644
--- a/test/test_mapairy_cdf_double.cu
+++ b/test/test_mapairy_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_cdf_float.cu b/test/test_mapairy_cdf_float.cu
index 41dd4615a..b67c0ee93 100644
--- a/test/test_mapairy_cdf_float.cu
+++ b/test/test_mapairy_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_pdf_double.cu b/test/test_mapairy_pdf_double.cu
index ad3abfbee..4ccd8b2f2 100644
--- a/test/test_mapairy_pdf_double.cu
+++ b/test/test_mapairy_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_pdf_float.cu b/test/test_mapairy_pdf_float.cu
index cabee4a2f..520ac9a68 100644
--- a/test/test_mapairy_pdf_float.cu
+++ b/test/test_mapairy_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_quan_double.cu b/test/test_mapairy_quan_double.cu
index fe6265eff..378700020 100644
--- a/test/test_mapairy_quan_double.cu
+++ b/test/test_mapairy_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_mapairy_quan_float.cu b/test/test_mapairy_quan_float.cu
index ad2f6b5eb..cd9d12007 100644
--- a/test/test_mapairy_quan_float.cu
+++ b/test/test_mapairy_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_cdf_double.cu b/test/test_saspoint5_cdf_double.cu
index 745ca2bf8..fb3e2f74c 100644
--- a/test/test_saspoint5_cdf_double.cu
+++ b/test/test_saspoint5_cdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_cdf_float.cu b/test/test_saspoint5_cdf_float.cu
index 51bc2e870..325a470bb 100644
--- a/test/test_saspoint5_cdf_float.cu
+++ b/test/test_saspoint5_cdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_pdf_double.cu b/test/test_saspoint5_pdf_double.cu
index 948a09260..5392a328b 100644
--- a/test/test_saspoint5_pdf_double.cu
+++ b/test/test_saspoint5_pdf_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_pdf_float.cu b/test/test_saspoint5_pdf_float.cu
index 4980e9070..01fbcd472 100644
--- a/test/test_saspoint5_pdf_float.cu
+++ b/test/test_saspoint5_pdf_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_quan_double.cu b/test/test_saspoint5_quan_double.cu
index 764c27899..7415f0690 100644
--- a/test/test_saspoint5_quan_double.cu
+++ b/test/test_saspoint5_quan_double.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_saspoint5_quan_float.cu b/test/test_saspoint5_quan_float.cu
index a65958fb8..d6f49084b 100644
--- a/test/test_saspoint5_quan_float.cu
+++ b/test/test_saspoint5_quan_float.cu
@@ -65,7 +65,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_cdf_double.cu b/test/test_weibull_cdf_double.cu
index 65efbe252..1b2e5cf0d 100644
--- a/test/test_weibull_cdf_double.cu
+++ b/test/test_weibull_cdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_cdf_float.cu b/test/test_weibull_cdf_float.cu
index 65c3ce1ff..76bf3a4e1 100644
--- a/test/test_weibull_cdf_float.cu
+++ b/test/test_weibull_cdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_pdf_double.cu b/test/test_weibull_pdf_double.cu
index 645df4c0a..dd48b57d6 100644
--- a/test/test_weibull_pdf_double.cu
+++ b/test/test_weibull_pdf_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_pdf_float.cu b/test/test_weibull_pdf_float.cu
index f1e6917f0..40064b1ed 100644
--- a/test/test_weibull_pdf_float.cu
+++ b/test/test_weibull_pdf_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_quan_double.cu b/test/test_weibull_quan_double.cu
index 2f0500602..9263fb536 100644
--- a/test/test_weibull_quan_double.cu
+++ b/test/test_weibull_quan_double.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
 
diff --git a/test/test_weibull_quan_float.cu b/test/test_weibull_quan_float.cu
index 3027e14dd..5dd6bd6ee 100644
--- a/test/test_weibull_quan_float.cu
+++ b/test/test_weibull_quan_float.cu
@@ -64,7 +64,7 @@ int main(void)
     }
 
     // Launch the Vector Add CUDA Kernel
-    int threadsPerBlock = 512;
+    int threadsPerBlock = 256;
     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;