diff --git a/test/compile_test/instantiate.hpp b/test/compile_test/instantiate.hpp index 1d186aff7..96c896486 100644 --- a/test/compile_test/instantiate.hpp +++ b/test/compile_test/instantiate.hpp @@ -264,6 +264,7 @@ void instantiate(RealType) boost::math::gamma_p(v1, v2); boost::math::gamma_q(v1, v2); boost::math::lgamma_q(v1, v2); + boost::math::lgamma_p(v1, v2); boost::math::gamma_p_inv(v1, v2); boost::math::gamma_q_inv(v1, v2); boost::math::gamma_p_inva(v1, v2); @@ -544,6 +545,7 @@ void instantiate(RealType) boost::math::gamma_p(v1 * 1, v2 + 0); boost::math::gamma_q(v1 * 1, v2 + 0); boost::math::lgamma_q(v1 * 1, v2 + 0); + boost::math::lgamma_p(v1 * 1, v2 + 0); boost::math::gamma_p_inv(v1 * 1, v2 + 0); boost::math::gamma_q_inv(v1 * 1, v2 + 0); boost::math::gamma_p_inva(v1 * 1, v2 + 0); @@ -796,6 +798,7 @@ void instantiate(RealType) boost::math::gamma_p(v1, v2, pol); boost::math::gamma_q(v1, v2, pol); boost::math::lgamma_q(v1, v2, pol); + boost::math::lgamma_p(v1, v2, pol); boost::math::gamma_p_inv(v1, v2, pol); boost::math::gamma_q_inv(v1, v2, pol); boost::math::gamma_p_inva(v1, v2, pol); @@ -1074,6 +1077,7 @@ void instantiate(RealType) test::gamma_p(v1, v2); test::gamma_q(v1, v2); test::lgamma_q(v1, v2); + test::lgamma_p(v1, v2); test::gamma_p_inv(v1, v2); test::gamma_q_inv(v1, v2); test::gamma_p_inva(v1, v2); @@ -1356,6 +1360,7 @@ void instantiate_mixed(RealType) boost::math::gamma_p(fr, lr); boost::math::gamma_q(i, s); boost::math::lgamma_q(i, s); + boost::math::lgamma_p(i, s); boost::math::gamma_q(fr, lr); boost::math::gamma_p_inv(i, fr); boost::math::gamma_q_inv(s, fr); @@ -1572,6 +1577,7 @@ void instantiate_mixed(RealType) boost::math::gamma_p(fr, lr, pol); boost::math::gamma_q(i, s, pol); boost::math::lgamma_q(i, s, pol); + boost::math::lgamma_p(i, s, pol); boost::math::gamma_q(fr, lr, pol); boost::math::gamma_p_inv(i, fr, pol); boost::math::gamma_q_inv(s, fr, pol); @@ -1784,8 +1790,10 @@ void instantiate_mixed(RealType) test::gamma_p(fr, lr); 
test::gamma_q(i, s); test::lgamma_q(i, s); + test::lgamma_p(i, s); test::gamma_q(fr, lr); test::lgamma_q(fr, lr); + test::lgamma_p(fr, lr); test::gamma_p_inv(i, fr); test::gamma_q_inv(s, fr); test::gamma_p_inva(i, lr); diff --git a/test/compile_test/sf_gamma_incl_test.cpp b/test/compile_test/sf_gamma_incl_test.cpp index 74ab85b2d..85045c857 100644 --- a/test/compile_test/sf_gamma_incl_test.cpp +++ b/test/compile_test/sf_gamma_incl_test.cpp @@ -45,6 +45,12 @@ void compile_and_link_test() check_result(boost::math::lgamma_q(l, l)); #endif +check_result(boost::math::lgamma_p(f, f)); +check_result(boost::math::lgamma_p(d, d)); +#ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS + check_result(boost::math::lgamma_p(l, l)); +#endif + check_result(boost::math::gamma_p_inv(f, f)); check_result(boost::math::gamma_p_inv(d, d)); #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 4082057fa..17ce1647b 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -371,6 +371,8 @@ run test_gamma_p_inv_double.cu ; run test_gamma_p_inv_float.cu ; run test_lgamma_q_double.cu ; run test_lgamma_q_float.cu ; +run test_lgamma_p_double.cu ; +run test_lgamma_p_float.cu ; run test_log1p_double.cu ; run test_log1p_float.cu ; diff --git a/test/test_lgamma_p_double.cu b/test/test_lgamma_p_double.cu new file mode 100644 index 000000000..5638e01a2 --- /dev/null +++ b/test/test_lgamma_p_double.cu @@ -0,0 +1,102 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/gamma.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::lgamma_p(in[i], in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::lgamma_p(input_vector[i], input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_lgamma_p_float.cu b/test/test_lgamma_p_float.cu
new file mode 100644
index 000000000..2c0dd91df
--- /dev/null
+++ b/test/test_lgamma_p_float.cu
@@ -0,0 +1,102 @@
+
+// Copyright John Maddock 2016.
+// Copyright Matt Borland 2024.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <boost/math/special_functions/gamma.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::lgamma_p(in[i], in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = rand()/(float_type)RAND_MAX;
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::lgamma_p(input_vector[i], input_vector[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}