diff --git a/doc/vector_functionals/univariate_statistics.qbk b/doc/vector_functionals/univariate_statistics.qbk index 5bb1e3693..d4d923f5e 100644 --- a/doc/vector_functionals/univariate_statistics.qbk +++ b/doc/vector_functionals/univariate_statistics.qbk @@ -33,6 +33,12 @@ namespace boost{ namespace math{ namespace tools { template auto median(ForwardIterator first, ForwardIterator last); + template + auto population_skewness(Container const & c); + + template + auto population_skewness(ForwardIterator first, ForwardIterator last); + template auto gini_coefficient(Container & c); @@ -100,6 +106,34 @@ Compute the median of a dataset: The calculation of the median is a thin wrapper around the C++11 [@https://en.cppreference.com/w/cpp/algorithm/nth_element `nth_element`]. Therefore, all requirements of `std::nth_element` are inherited by the median calculation. +[heading Skewness] + +Computes the skewness of a dataset: + + std::vector v{1,2,3,4,5}; + double skewness = boost::math::tools::population_skewness(v); + // skewness = 0. + +The input vector is not modified, works with integral and real data. +If the input data is integral, the output is a double precision float. + +For a dataset consisting of a constant value, we return zero as the skewness. + +The implementation follows [@https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf Pebay]. + +[heading Kurtosis] + +Computes the kurtosis of a dataset: + + std::vector v{1,2,3,4,5}; + double kurtosis = boost::math::tools::population_kurtosis(v); + // kurtosis = 17/10 + +The implementation follows [@https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf Pebay]. +The input data must be forward iterable and must consist of real or integral values. +If the input data is integral, the output is a double precision float. +Note that this is /not/ the excess kurtosis. +If you require the excess kurtosis, subtract 3 from the kurtosis. [heading Gini Coefficient] @@ -126,9 +160,7 @@ You should have /very/ good cause to pass negative values to the Gini coefficien [heading References] * Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. -* Mallat, Stephane. ['A wavelet tour of signal processing: the sparse way.] Academic press, 2008. -* Hurley, Niall, and Scott Rickard. ['Comparing measures of sparsity.] IEEE Transactions on Information Theory 55.10 (2009): 4723-4741. -* Jensen, Arne, and Anders la Cour-Harbo. ['Ripples in mathematics: the discrete wavelet transform.] Springer Science & Business Media, 2001. +* Philippe P. Pébay: ["Formulas for Robust, One-Pass Parallel Computation of Covariances and Arbitrary-Order Statistical Moments.] Technical Report SAND2008-6212, Sandia National Laboratories, September 2008. [endsect] [/section:univariate_statistics Univariate Statistics] diff --git a/include/boost/math/tools/univariate_statistics.hpp b/include/boost/math/tools/univariate_statistics.hpp index 8ec3fed84..5e0570cd9 100644 --- a/include/boost/math/tools/univariate_statistics.hpp +++ b/include/boost/math/tools/univariate_statistics.hpp @@ -154,5 +154,145 @@ inline auto gini_coefficient(RandomAccessContainer & v) return gini_coefficient(v.begin(), v.end()); } +// Follows equation 1.5 of: +// https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf +template +auto +population_skewness(ForwardIterator first, ForwardIterator last) +{ + using Real = typename std::iterator_traits::value_type; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness."); + if constexpr (std::is_integral::value) + { + double M1 = *first; + double M2 = 0; + double M3 = 0; + double n = 2; + for (auto it = first + 1; it != last; ++it) + { + double delta21 = *it - M1; + double tmp = delta21/n; + M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); + M2 = M2 + tmp*(n-1)*delta21; + M1 = M1 + tmp; + n += 1; + } + + double variance = M2/(n-1); + if (variance == 0) + { + // The limit is technically undefined, but the interpretation here is clear: + // A constant dataset has no skewness. + return double(0); + } + double skewness = M3/((n-1)*variance*sqrt(variance)); + return skewness; + } + else + { + Real M1 = *first; + Real M2 = 0; + Real M3 = 0; + Real n = 2; + for (auto it = first + 1; it != last; ++it) + { + Real delta21 = *it - M1; + Real tmp = delta21/n; + M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); + M2 = M2 + tmp*(n-1)*delta21; + M1 = M1 + tmp; + n += 1; + } + + Real variance = M2/(n-1); + if (variance == 0) + { + // The limit is technically undefined, but the interpretation here is clear: + // A constant dataset has no skewness. + return Real(0); + } + Real skewness = M3/((n-1)*variance*sqrt(variance)); + return skewness; + } +} + +template +inline auto population_skewness(Container const & v) +{ + return population_skewness(v.cbegin(), v.cend()); +} + +// Follows equation 1.6 of: +// https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf +template +auto +population_kurtosis(ForwardIterator first, ForwardIterator last) +{ + using Real = typename std::iterator_traits::value_type; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute kurtosis."); + if constexpr (std::is_integral::value) + { + double M1 = *first; + double M2 = 0; + double M3 = 0; + double M4 = 0; + double n = 2; + for (auto it = first + 1; it != last; ++it) + { + double delta21 = *it - M1; + double tmp = delta21/n; + M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3); + M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); + M2 = M2 + tmp*(n-1)*delta21; + M1 = M1 + tmp; + n += 1; + } + + double variance = M2/(n-1); + if (variance == 0) + { + return double(0); + } + double kurtosis = M4/((n-1)*variance*variance); + return kurtosis; + } + else + { + Real M1 = *first; + Real M2 = 0; + Real M3 = 0; + Real M4 = 0; + Real n = 2; + for (auto it = first + 1; it != last; ++it) + { + Real delta21 = *it - M1; + Real tmp = delta21/n; + M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3); + M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2); + M2 = M2 + tmp*(n-1)*delta21; + M1 = M1 + tmp; + n += 1; + } + + Real variance = M2/(n-1); + if (variance == 0) + { + // Again, the limit is technically undefined, but the interpretation here is clear: + // A constant dataset has no kurtosis. + return Real(0); + } + Real kurtosis = M4/((n-1)*variance*variance); + return kurtosis; + } +} + +template +inline auto population_kurtosis(Container const & v) +{ + return population_kurtosis(v.cbegin(), v.cend()); +} + + + }}} #endif diff --git a/test/univariate_statistics_test.cpp b/test/univariate_statistics_test.cpp index 955d1ad06..ded487c99 100644 --- a/test/univariate_statistics_test.cpp +++ b/test/univariate_statistics_test.cpp @@ -213,6 +213,85 @@ void test_gini_coefficient() BOOST_TEST(abs(gini) < tol); } +template +void test_integer_skewness() +{ + double tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1}; + double skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew) < tol); + + // Dataset is symmetric about the mean: + v = {1,2,3,4,5}; + skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew) < tol); + + v = {0,0,0,0,5}; + // mu = 1, sigma^2 = 4, sigma = 2, skew = 3/2 + skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew - 3.0/2.0) < tol); + +} + +template +void test_skewness() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1}; + Real skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew) < tol); + + // Dataset is symmetric about the mean: + v = {1,2,3,4,5}; + skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew) < tol); + + v = {0,0,0,0,5}; + // mu = 1, sigma^2 = 4, sigma = 2, skew = 3/2 + skew = boost::math::tools::population_skewness(v); + BOOST_TEST(abs(skew - Real(3)/Real(2)) < tol); + +} + +template +void test_kurtosis() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1}; + Real kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis) < tol); + + v = {1,2,3,4,5}; + // mu =1, sigma^2 = 2, kurtosis = 17/10 + kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis - Real(17)/Real(10)) < tol); + + v = {0,0,0,0,5}; + // mu = 1, sigma^2 = 4, sigma = 2, skew = 3/2, kurtosis = 13/4 + kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis- Real(13)/Real(4)) < tol); +} + +template +void test_integer_kurtosis() +{ + double tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1}; + double kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis) < tol); + + v = {1,2,3,4,5}; + // mu =1, sigma^2 = 2, kurtosis = 17/10 + kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis - 17.0/10.0) < tol); + + v = {0,0,0,0,5}; + // mu = 1, sigma^2 = 4, sigma = 2, skew = 3/2, kurtosis = 13/4 + kurtosis = boost::math::tools::population_kurtosis(v); + BOOST_TEST(abs(kurtosis- 13.0/4.0) < tol); +} + + int main() { test_integer_mean(); @@ -244,5 +323,19 @@ int main() test_gini_coefficient(); test_gini_coefficient(); + test_skewness(); + test_skewness(); + test_skewness(); + test_skewness(); + + test_integer_skewness(); + + test_kurtosis(); + test_kurtosis(); + test_kurtosis(); + test_kurtosis(); + + test_integer_kurtosis(); + return boost::report_errors(); }