From 94ceca1e43bb7d055654b0f5ecca9e8a6421af24 Mon Sep 17 00:00:00 2001 From: Nick Thompson Date: Mon, 10 Dec 2018 12:05:05 -0700 Subject: [PATCH] Split descriptive_statistics.hpp into univariate_statistics.hpp and a currently-hypothetical bivariate_statistics.hpp [CI SKIP] --- doc/math.qbk | 3 +- doc/vector_functionals/norms.qbk | 7 +- doc/vector_functionals/signal_statistics.qbk | 144 ++++++++++ ...atistics.qbk => univariate_statistics.qbk} | 115 +------- include/boost/math/tools/norms.hpp | 32 ++- ...e_statistics.hpp => signal_statistics.hpp} | 148 +---------- .../math/tools/univariate_statistics.hpp | 158 +++++++++++ test/Jamfile.v2 | 3 +- test/norms_test.cpp | 17 +- ...cs_test.cpp => signal_statistics_test.cpp} | 215 +-------------- test/univariate_statistics_test.cpp | 248 ++++++++++++++++++ 11 files changed, 604 insertions(+), 486 deletions(-) create mode 100644 doc/vector_functionals/signal_statistics.qbk rename doc/vector_functionals/{descriptive_statistics.qbk => univariate_statistics.qbk} (54%) rename include/boost/math/tools/{descriptive_statistics.hpp => signal_statistics.hpp} (55%) create mode 100644 include/boost/math/tools/univariate_statistics.hpp rename test/{descriptive_statistics_test.cpp => signal_statistics_test.cpp} (55%) create mode 100644 test/univariate_statistics_test.cpp diff --git a/doc/math.qbk b/doc/math.qbk index c5f30c6f3..0903a6d97 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -553,7 +553,8 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [endmathpart] [/section:dist Statistical Distributions and Functions] [mathpart vector_functionals Vector Functionals] -[include vector_functionals/descriptive_statistics.qbk] +[include vector_functionals/univariate_statistics.qbk] +[include vector_functionals/signal_statistics.qbk] [include vector_functionals/norms.qbk] [endmathpart] [/section:vector_functionals Vector Functionals] diff --git a/doc/vector_functionals/norms.qbk b/doc/vector_functionals/norms.qbk index 221980c54..553ea5676 100644 --- a/doc/vector_functionals/norms.qbk +++ b/doc/vector_functionals/norms.qbk @@ -122,7 +122,7 @@ The \u2113[super 2] norm is again a special case of the \u2113[super /p/] norm, double l1 = boost::math::tools::l2_norm(v.begin(), v.end()); // l1 = sqrt(3) -Requires a forward iterable input, does not modify input data, and works with complex numbers. +Requires a forward iterable input, does not modify input data, and works with real and complex numbers. [heading Total Variation] @@ -135,11 +135,14 @@ Requires a forward iterable input, does not modify input data, and works with co std::vector v{1,1,1}; int tv = boost::math::tools::total_variation(v); -The total variation only supports real numbers and /signed/ integers. +The total variation only supports real numbers and integers. All the constituent operations to compute the total variation are well-defined for complex numbers, but the computed result is not meaningful; a 2D total variation is more appropriate. The container must be forward iterable, and the contents are not modified. +As an aside, the total variation is not technically a norm, since /TV(v) = 0/ does not imply /v = 0/. +However, it satisfies the triangle inequality and is absolutely 1-homogeneous, so it is a seminorm, and hence is grouped with the other norms here. + [heading References] * Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. diff --git a/doc/vector_functionals/signal_statistics.qbk b/doc/vector_functionals/signal_statistics.qbk new file mode 100644 index 000000000..3e1bae7d0 --- /dev/null +++ b/doc/vector_functionals/signal_statistics.qbk @@ -0,0 +1,144 @@ +[/ + Copyright 2018 Nick Thompson + + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] + +[section:signal_statistics Signal Statistics] + +[heading Synopsis] + +`` +#include + +namespace boost{ namespace math{ namespace tools { + + template + auto absolute_median(Container & c); + + template + auto absolute_median(ForwardIterator first, ForwardIterator last); + + template + auto absolute_gini_coefficient(Container & c); + + template + auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last); + + template + auto hoyer_sparsity(Container const & c); + + template + auto hoyer_sparsity(ForwardIterator first, ForwardIterator last); + + template + auto shannon_entropy(Container const & c); + + template + auto shannon_entropy(ForwardIterator first, ForwardIterator last); + + template + auto shannon_cost(Container const & c); + + template + auto shannon_cost(ForwardIterator first, ForwardIterator last); + + +}}} +`` + +[heading Description] + +The file `boost/math/tools/signal_statistics.hpp` is a set of facilities for computing quantities commonly used in signal analysis. + +Our examples use `std::vector` to hold the data, but this not required. +In general, you can store your data in an Eigen array, and Armadillo vector, `std::array`, and for many of the routines, a `std::forward_list`. +These routines are usable in float, double, long double, and Boost.Multiprecision precision, as well as their complex extensions whenever the computation is well-defined. +For certain operations (total variation, for example) integer inputs are supported. + +[heading Absolute Median] + +The absolute median is used in signal processing, where the median of the magnitude of the coefficients in some expansion are used to estimate noise variance. +See [@https://wavelet-tour.github.io/ Mallat] for details. +The absolute median supports both real and complex arithmetic, modifies its input, and requires random access iterators. + + std::vector v{-1, 1}; + double m = boost::math::tools::absolute_median(v.begin(), v.end()); + // m = 1 + +[heading Absolute Gini Coefficient] + +The Gini coefficient, first used to measure wealth inequality, is also one of the best measures of the sparsity of an expansion in a basis. +A sparse expansion has most of its norm concentrated in just a few coefficients, making the connection with wealth inequality obvious. +However, for measuring sparsity, the phase of the numbers is irrelevant, so we provide the `absolute_gini_coefficient`: + + std::vector> v{{0,1}, {0,0}, {0,0}, {0,0}}; + double abs_gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); + // now abs_gini = 1 + + std::vector> w{{0,1}, {1,0}, {0,-1}, {-1,0}}; + double abs_gini = boost::math::tools::absolute_gini_coefficient(w.begin(), w.end()); + // now abs_gini = 0 + + std::vector u{-1, 1, -1}; + double abs_gini = boost::math::tools::absolute_gini_coefficient(u.begin(), u.end()); + // now abs_gini = 0 + +Wikipedia calls our scaling a "sample Gini coefficient". +We chose this scaling because it always returns unity for a vector which has only one nonzero coefficient, +whereas the value of the population Gini coefficient of a vector with one non-zero element is dependent on the length of the input. + +If sorting the input data is too much expense for a sparsity measure (is it going to be perfect anyway?), +consider calculating the Hoyer sparsity instead. + +[heading Hoyer Sparsity] + +The Hoyer sparsity measures a normalized ratio of the \u2113[super 1] and \u2113[super 2] norms. +As the name suggests, it is used to measure sparsity in an expansion in some basis. + +The Hoyer sparsity computes ([radic]/N/ - \u2113[super 1](v)/\u2113[super 2](v))/([radic]N -1). +For details, see [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard]. + +Usage: + + std::vector v{1,0,0}; + Real hs = boost::math::tools::hoyer_sparsity(v); + // hs = 1 + std::vector v{1,-1,1}; + Real hs = boost::math::tools::hoyer_sparsity(v.begin(), v.end()); + // hs = 0 + +The container must be forward iterable and the contents are not modified. +Accepts real, complex, and integer inputs. If the input is an integral type, the output is a double precision float. + +[heading Shannon Entropy] + + std::vector v{1/2.0, 1/2.0}; + double Hs = boost::math::tools::shannon_entropy(v.begin(), v.end()); + // Hs = ln(2). + +The Shannon entropy only supports non-negative real-valued inputs, presumably for interpretational purposes in the range [0,1]-though this is not enforced. +The natural logarithm is used to compute the Shannon entropy; all other "Shannon entropies" are readily obtained by change of log base. + +[heading Shannon Cost] + + std::vector v{-1, 1,-1}; + double Ks = boost::math::tools::shannon_cost(v.begin(), v.end()); + // Ks = 0; concentration of the vector is minimized. + +The Shannon cost is a modified version of the Shannon entropy used in signal processing and data compression. +The useful properties of the Shannon cost are /K/[sub /s/](0) = 0 and /K/[sub /s/](/v/\u2295/w/) = /K/[sub /s/](v) + /K/[sub /s/](w). +See [@https://doi.org/10.1007/978-3-642-56702-5 Ripples in Mathematics] for details. + + +[heading References] + +* Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. +* Mallat, Stephane. ['A wavelet tour of signal processing: the sparse way.] Academic press, 2008. +* Hurley, Niall, and Scott Rickard. ['Comparing measures of sparsity.] IEEE Transactions on Information Theory 55.10 (2009): 4723-4741. +* Jensen, Arne, and Anders la Cour-Harbo. ['Ripples in mathematics: the discrete wavelet transform.] Springer Science & Business Media, 2001. + +[endsect] +[/section:signal_statistics Signal Statistics] diff --git a/doc/vector_functionals/descriptive_statistics.qbk b/doc/vector_functionals/univariate_statistics.qbk similarity index 54% rename from doc/vector_functionals/descriptive_statistics.qbk rename to doc/vector_functionals/univariate_statistics.qbk index 1cc307b3c..5bb1e3693 100644 --- a/doc/vector_functionals/descriptive_statistics.qbk +++ b/doc/vector_functionals/univariate_statistics.qbk @@ -1,17 +1,17 @@ [/ - Copyright 2017 Nick Thompson + Copyright 2018 Nick Thompson Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). ] -[section:descriptive_statistics Descriptive Statistics] +[section:univariate_statistics Univariate Statistics] [heading Synopsis] `` -#include +#include namespace boost{ namespace math{ namespace tools { @@ -33,55 +33,24 @@ namespace boost{ namespace math{ namespace tools { template auto median(ForwardIterator first, ForwardIterator last); - template - auto absolute_median(Container & c); - - template - auto absolute_median(ForwardIterator first, ForwardIterator last); - template auto gini_coefficient(Container & c); template auto gini_coefficient(ForwardIterator first, ForwardIterator last); - template - auto absolute_gini_coefficient(Container & c); - - template - auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last); - - template - auto hoyer_sparsity(Container const & c); - - template - auto hoyer_sparsity(ForwardIterator first, ForwardIterator last); - - template - auto shannon_entropy(Container const & c); - - template - auto shannon_entropy(ForwardIterator first, ForwardIterator last); - - template - auto shannon_cost(Container const & c); - - template - auto shannon_cost(ForwardIterator first, ForwardIterator last); - - }}} `` [heading Description] -The file `boost/math/tools/descriptive_statistics.hpp` is a set of facilities for computing scalar values from vectors. +The file `boost/math/tools/univariate_statistics.hpp` is a set of facilities for computing scalar values from vectors. Many of these functionals have trivial naive implementations, but experienced programmers will recognize that even trivial algorithms are easy to screw up, and that numerical instabilities often lurk in corner cases. We have attempted to do our "due diligence" to root out these problems-scouring the literature for numerically stable algorithms for even the simplest of functionals. /Nota bene/: Some similar functionality is provided in [@https://www.boost.org/doc/libs/1_68_0/doc/html/accumulators/user_s_guide.html Boost Accumulators Framework]. -These accumulators should be used in real-time applications; `descriptive_statistics.hpp` should be used when CPU vectorization is needed. +These accumulators should be used in real-time applications; `univariate_statistics.hpp` should be used when CPU vectorization is needed. As a reminder, remember that to actually /get/ vectorization, compile with `-march=native -O3` flags. We now describe each functional in detail. @@ -131,15 +100,6 @@ Compute the median of a dataset: The calculation of the median is a thin wrapper around the C++11 [@https://en.cppreference.com/w/cpp/algorithm/nth_element `nth_element`]. Therefore, all requirements of `std::nth_element` are inherited by the median calculation. -[heading Absolute Median] - -The absolute median is used in signal processing, where the median of the magnitude of the coefficients in some expansion are used to estimate noise variance. -See [@https://wavelet-tour.github.io/ Mallat] for details. -The absolute median supports both real and complex arithmetic, modifies its input, and requires random access iterators. - - std::vector v{-1, 1}; - double m = boost::math::tools::absolute_median(v.begin(), v.end()); - // m = 1 [heading Gini Coefficient] @@ -163,69 +123,6 @@ If you wish to convert the Boost Gini coefficient to the population Gini coeffic However, a single use case (measuring wealth inequality when some people have negative wealth) exists, so we do not throw an exception when negative values are encountered. You should have /very/ good cause to pass negative values to the Gini coefficient calculator. -The Gini coefficient, first used to measure wealth inequality, is also one of the best measures of the sparsity of an expansion in a basis. -A sparse expansion has most of its norm concentrated in just a few coefficients, making the connection with wealth inequality obvious. -However, for measuring sparsity, the phase of the numbers is irrelevant, so `absolute_gini_coefficient` should be used instead: - - std::vector> v{{0,1}, {0,0}, {0,0}, {0,0}}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); - // now abs_gini = 1 - - std::vector> w{{0,1}, {1,0}, {0,-1}, {-1,0}}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(w.begin(), w.end()); - // now abs_gini = 0 - - std::vector u{-1, 1, -1}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(u.begin(), u.end()); - // now abs_gini = 0 - -Again, Wikipedia denotes our scaling as a "sample Gini coefficient". -We chose this scaling because it always returns unity for a vector which has only one nonzero coefficient, -whereas the value of the population Gini coefficient of a vector with one non-zero element is dependent on the length of the input. - -If sorting the input data is too much expense for a sparsity measure (is it going to be perfect anyway?), -consider calculating the Hoyer sparsity instead. - -[heading Hoyer Sparsity] - -The Hoyer sparsity measures a normalized ratio of the \u2113[super 1] and \u2113[super 2] norms. -As the name suggests, it is used to measure sparsity in an expansion in some basis. - -The Hoyer sparsity computes ([radic]/N/ - \u2113[super 1](v)/\u2113[super 2](v))/([radic]N -1). -For details, see [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard]. - -Usage: - - std::vector v{1,0,0}; - Real hs = boost::math::tools::hoyer_sparsity(v); - // hs = 1 - std::vector v{1,-1,1}; - Real hs = boost::math::tools::hoyer_sparsity(v.begin(), v.end()); - // hs = 0 - -The container must be forward iterable and the contents are not modified. -Accepts real, complex, and integer inputs. If the input is an integral type, the output is a double precision float. - -[heading Shannon Entropy] - - std::vector v{1/2.0, 1/2.0}; - double Hs = boost::math::tools::shannon_entropy(v.begin(), v.end()); - // Hs = ln(2). - -The Shannon entropy only supports non-negative real-valued inputs, presumably for interpretational purposes in the range [0,1]-though this is not enforced. -The natural logarithm is used to compute the Shannon entropy; all other "Shannon entropies" are readily obtained by change of log base. - -[heading Shannon Cost] - - std::vector v{-1, 1,-1}; - double Ks = boost::math::tools::shannon_cost(v.begin(), v.end()); - // Ks = 0; concentration of the vector is minimized. - -The Shannon cost is a modified version of the Shannon entropy used in signal processing and data compression. -The useful properties of the Shannon cost are /K/[sub /s/](0) = 0 and /K/[sub /s/](/v/\u2295 /w/) = /K/[sub /s/](v) + /K/[sub /s/](w). -See [@https://doi.org/10.1007/978-3-642-56702-5 Ripples in Mathematics] for details. - - [heading References] * Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. @@ -234,4 +131,4 @@ See [@https://doi.org/10.1007/978-3-642-56702-5 Ripples in Mathematics] for deta * Jensen, Arne, and Anders la Cour-Harbo. ['Ripples in mathematics: the discrete wavelet transform.] Springer Science & Business Media, 2001. [endsect] -[/section:descriptive_statistics Descriptive Statistics] +[/section:univariate_statistics Univariate Statistics] diff --git a/include/boost/math/tools/norms.hpp b/include/boost/math/tools/norms.hpp index 8563b1570..449d2eb53 100644 --- a/include/boost/math/tools/norms.hpp +++ b/include/boost/math/tools/norms.hpp @@ -12,10 +12,6 @@ #include #include -/* - * A set of tools for computing scalar quantities associated with lists of numbers. - */ - namespace boost{ namespace math{ namespace tools { @@ -29,12 +25,32 @@ auto total_variation(ForwardIterator first, ForwardIterator last) Real tv = 0; auto it = first; Real tmp = *it; - while (++it != last) + + if constexpr (std::is_unsigned::value) { - tv += abs(*it - tmp); - tmp = *it; + while (++it != last) + { + if (*it > tmp) + { + tv += *it - tmp; + } + else + { + tv += tmp - *it; + } + tmp = *it; + } + return tv; + } + else + { + while (++it != last) + { + tv += abs(*it - tmp); + tmp = *it; + } + return tv; } - return tv; } template diff --git a/include/boost/math/tools/descriptive_statistics.hpp b/include/boost/math/tools/signal_statistics.hpp similarity index 55% rename from include/boost/math/tools/descriptive_statistics.hpp rename to include/boost/math/tools/signal_statistics.hpp index 4e625e5c5..221978fe1 100644 --- a/include/boost/math/tools/descriptive_statistics.hpp +++ b/include/boost/math/tools/signal_statistics.hpp @@ -3,8 +3,8 @@ // Boost Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#ifndef BOOST_MATH_TOOLS_DESCRIPTIVE_STATISTICS_HPP -#define BOOST_MATH_TOOLS_DESCRIPTIVE_STATISTICS_HPP +#ifndef BOOST_MATH_TOOLS_SIGNAL_STATISTICS_HPP +#define BOOST_MATH_TOOLS_SIGNAL_STATISTICS_HPP #include #include @@ -12,119 +12,9 @@ #include #include -/* - * A set of tools for computing scalar quantities associated with lists of numbers. - */ - namespace boost{ namespace math{ namespace tools { -template -auto -mean(ForwardIterator first, ForwardIterator last) -{ - using Real = typename std::iterator_traits::value_type; - BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean."); - if constexpr (std::is_integral::value) - { - double mu = 0; - double i = 1; - for(auto it = first; it != last; ++it) { - mu = mu + (*it - mu)/i; - i += 1; - } - return mu; - } - else - { - Real mu = 0; - Real i = 1; - for(auto it = first; it != last; ++it) { - mu = mu + (*it - mu)/i; - i += 1; - } - return mu; - } -} - -template -inline auto mean(Container const & v) -{ - return mean(v.cbegin(), v.cend()); -} - -template -auto -mean_and_population_variance(ForwardIterator first, ForwardIterator last) -{ - using Real = typename std::iterator_traits::value_type; - BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance."); - // Higham, Accuracy and Stability, equation 1.6a and 1.6b: - if constexpr (std::is_integral::value) - { - double M = *first; - double Q = 0; - double k = 2; - for (auto it = first + 1; it != last; ++it) - { - double tmp = *it - M; - Q = Q + ((k-1)*tmp*tmp)/k; - M = M + tmp/k; - k += 1; - } - return std::make_pair(M, Q/(k-1)); - } - else - { - Real M = *first; - Real Q = 0; - Real k = 2; - for (auto it = first + 1; it != last; ++it) - { - Real tmp = *it - M; - Q = Q + ((k-1)*tmp*tmp)/k; - M = M + tmp/k; - k += 1; - } - - return std::make_pair(M, Q/(k-1)); - } -} - -template -inline auto mean_and_population_variance(Container const & v) -{ - return mean_and_population_variance(v.cbegin(), v.cend()); -} - -template -auto median(RandomAccessIterator first, RandomAccessIterator last) -{ - size_t num_elems = std::distance(first, last); - BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined."); - if (num_elems & 1) - { - auto middle = first + (num_elems - 1)/2; - std::nth_element(first, middle, last); - return *middle; - } - else - { - auto middle = first + num_elems/2 - 1; - std::nth_element(first, middle, last); - std::nth_element(middle, middle+1, last); - return (*middle + *(middle+1))/2; - } -} - - -template -inline auto median(RandomAccessContainer & v) -{ - return median(v.begin(), v.end()); -} - - template auto absolute_median(RandomAccessIterator first, RandomAccessIterator last) { @@ -202,40 +92,6 @@ inline auto shannon_cost(Container const & v) } -template -auto gini_coefficient(ForwardIterator first, ForwardIterator last) -{ - using Real = typename std::iterator_traits::value_type; - BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); - - std::sort(first, last); - - Real i = 1; - Real num = 0; - Real denom = 0; - for (auto it = first; it != last; ++it) - { - num += *it*i; - denom += *it; - ++i; - } - - // If the l1 norm is zero, all elements are zero, so every element is the same. - if (denom == 0) - { - return Real(0); - } - - return ((2*num)/denom - i)/(i-2); -} - -template -inline auto gini_coefficient(RandomAccessContainer & v) -{ - return gini_coefficient(v.begin(), v.end()); -} - - template auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last) { diff --git a/include/boost/math/tools/univariate_statistics.hpp b/include/boost/math/tools/univariate_statistics.hpp new file mode 100644 index 000000000..8ec3fed84 --- /dev/null +++ b/include/boost/math/tools/univariate_statistics.hpp @@ -0,0 +1,158 @@ +// (C) Copyright Nick Thompson 2018. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP +#define BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP + +#include +#include +#include +#include +#include + + +namespace boost{ namespace math{ namespace tools { + +template +auto +mean(ForwardIterator first, ForwardIterator last) +{ + using Real = typename std::iterator_traits::value_type; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean."); + if constexpr (std::is_integral::value) + { + double mu = 0; + double i = 1; + for(auto it = first; it != last; ++it) { + mu = mu + (*it - mu)/i; + i += 1; + } + return mu; + } + else + { + Real mu = 0; + Real i = 1; + for(auto it = first; it != last; ++it) { + mu = mu + (*it - mu)/i; + i += 1; + } + return mu; + } +} + +template +inline auto mean(Container const & v) +{ + return mean(v.cbegin(), v.cend()); +} + +template +auto +mean_and_population_variance(ForwardIterator first, ForwardIterator last) +{ + using Real = typename std::iterator_traits::value_type; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance."); + // Higham, Accuracy and Stability, equation 1.6a and 1.6b: + if constexpr (std::is_integral::value) + { + double M = *first; + double Q = 0; + double k = 2; + for (auto it = first + 1; it != last; ++it) + { + double tmp = *it - M; + Q = Q + ((k-1)*tmp*tmp)/k; + M = M + tmp/k; + k += 1; + } + return std::make_pair(M, Q/(k-1)); + } + else + { + Real M = *first; + Real Q = 0; + Real k = 2; + for (auto it = first + 1; it != last; ++it) + { + Real tmp = *it - M; + Q = Q + ((k-1)*tmp*tmp)/k; + M = M + tmp/k; + k += 1; + } + + return std::make_pair(M, Q/(k-1)); + } +} + +template +inline auto mean_and_population_variance(Container const & v) +{ + return mean_and_population_variance(v.cbegin(), v.cend()); +} + +template +auto median(RandomAccessIterator first, RandomAccessIterator last) +{ + size_t num_elems = std::distance(first, last); + BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined."); + if (num_elems & 1) + { + auto middle = first + (num_elems - 1)/2; + std::nth_element(first, middle, last); + return *middle; + } + else + { + auto middle = first + num_elems/2 - 1; + std::nth_element(first, middle, last); + std::nth_element(middle, middle+1, last); + return (*middle + *(middle+1))/2; + } +} + + +template +inline auto median(RandomAccessContainer & v) +{ + return median(v.begin(), v.end()); +} + + +template +auto gini_coefficient(ForwardIterator first, ForwardIterator last) +{ + using Real = typename std::iterator_traits::value_type; + BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); + + std::sort(first, last); + + Real i = 1; + Real num = 0; + Real denom = 0; + for (auto it = first; it != last; ++it) + { + num += *it*i; + denom += *it; + ++i; + } + + // If the l1 norm is zero, all elements are zero, so every element is the same. + if (denom == 0) + { + return Real(0); + } + + return ((2*num)/denom - i)/(i-2); +} + +template +inline auto gini_coefficient(RandomAccessContainer & v) +{ + return gini_coefficient(v.begin(), v.end()); +} + +}}} +#endif diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index a8fa366ac..2bb6afd7c 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -902,8 +902,9 @@ test-suite misc : [ run test_constant_generate.cpp : : : release USE_CPP_FLOAT=1 off:no ] [ run test_cubic_b_spline.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_smart_ptr cxx11_defaulted_functions ] off msvc:/bigobj release ] [ run catmull_rom_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] # does not in fact require C++17 constexpr; requires C++17 std::size. - [ run descriptive_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] + [ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] [ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] + [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr ] ] [ run test_real_concept.cpp ../../test/build//boost_unit_test_framework ] [ run test_remez.cpp pch ../../test/build//boost_unit_test_framework ] [ run test_roots.cpp pch ../../test/build//boost_unit_test_framework ] diff --git a/test/norms_test.cpp b/test/norms_test.cpp index 470c86c72..af7a89d58 100644 --- a/test/norms_test.cpp +++ b/test/norms_test.cpp @@ -109,6 +109,12 @@ void test_integer_total_variation() std::array w{1,1}; tv = boost::math::tools::total_variation(w); BOOST_TEST_EQ(tv,0); + + // Work with both signed and unsigned integers? + std::array u{1, 2, 1, 2}; + tv = boost::math::tools::total_variation(u); + BOOST_TEST_EQ(tv, 3); + } template @@ -335,6 +341,11 @@ int main() test_integer_l1_norm(); + test_complex_l1_norm>(); + test_complex_l1_norm>(); + test_complex_l1_norm>(); + test_complex_l1_norm(); + test_complex_l2_norm>(); test_complex_l2_norm>(); test_complex_l2_norm>(); @@ -345,16 +356,12 @@ int main() test_l2_norm(); test_l2_norm(); - test_complex_l1_norm>(); - test_complex_l1_norm>(); - test_complex_l1_norm>(); - test_complex_l1_norm(); - test_total_variation(); test_total_variation(); test_total_variation(); test_total_variation(); + test_integer_total_variation(); test_integer_total_variation(); return boost::report_errors(); diff --git a/test/descriptive_statistics_test.cpp b/test/signal_statistics_test.cpp similarity index 55% rename from test/descriptive_statistics_test.cpp rename to test/signal_statistics_test.cpp index 929ef40ed..c232fd486 100644 --- a/test/descriptive_statistics_test.cpp +++ b/test/signal_statistics_test.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -29,162 +29,6 @@ using boost::multiprecision::cpp_complex_50; * 5) Does it work with complex data if complex data is sensible? */ -template -void test_integer_mean() -{ - double tol = std::numeric_limits::epsilon(); - std::vector v{1,2,3,4,5}; - double mu = boost::math::tools::mean(v); - BOOST_TEST(abs(mu - 3) < tol); - - // Work with std::array? - std::array w{1,2,3,4,5}; - mu = boost::math::tools::mean(w); - BOOST_TEST(abs(mu - 3) < tol); -} - -template -void test_mean() -{ - Real tol = std::numeric_limits::epsilon(); - std::vector v{1,2,3,4,5}; - Real mu = boost::math::tools::mean(v.begin(), v.end()); - BOOST_TEST(abs(mu - 3) < tol); - - // Does range call work? - mu = boost::math::tools::mean(v); - BOOST_TEST(abs(mu - 3) < tol); - - // Can we successfully average only part of the vector? - mu = boost::math::tools::mean(v.begin(), v.begin() + 3); - BOOST_TEST(abs(mu - 2) < tol); - - // Does it work when we const qualify? - mu = boost::math::tools::mean(v.cbegin(), v.cend()); - BOOST_TEST(abs(mu - 3) < tol); - - // Does it work for std::array? - std::array u{1,2,3,4,5,6,7}; - mu = boost::math::tools::mean(u.begin(), u.end()); - BOOST_TEST(abs(mu - 4) < tol); - - // Does it work for a forward iterator? - std::forward_list l{1,2,3,4,5,6,7}; - mu = boost::math::tools::mean(l.begin(), l.end()); - BOOST_TEST(abs(mu - 4) < tol); - - // Does it work with ublas vectors? - boost::numeric::ublas::vector w(7); - for (size_t i = 0; i < w.size(); ++i) - { - w[i] = i+1; - } - mu = boost::math::tools::mean(w.cbegin(), w.cend()); - BOOST_TEST(abs(mu - 4) < tol); - -} - -template -void test_complex_mean() -{ - typedef typename Complex::value_type Real; - Real tol = std::numeric_limits::epsilon(); - std::vector v{{0,1},{0,2},{0,3},{0,4},{0,5}}; - auto mu = boost::math::tools::mean(v.begin(), v.end()); - BOOST_TEST(abs(mu.imag() - 3) < tol); - BOOST_TEST(abs(mu.real()) < tol); - - // Does range work? - mu = boost::math::tools::mean(v); - BOOST_TEST(abs(mu.imag() - 3) < tol); - BOOST_TEST(abs(mu.real()) < tol); -} - -template -void test_mean_and_population_variance() -{ - Real tol = std::numeric_limits::epsilon(); - std::vector v{1,1,1,1,1,1}; - auto [mu, sigma_sq] = boost::math::tools::mean_and_population_variance(v.begin(), v.end()); - BOOST_TEST(abs(mu - 1) < tol); - BOOST_TEST(abs(sigma_sq) < tol); - - std::vector u{1}; - auto [mu1, sigma1_sq] = boost::math::tools::mean_and_population_variance(u.cbegin(), u.cend()); - BOOST_TEST(abs(mu1 - 1) < tol); - BOOST_TEST(abs(sigma1_sq) < tol); - - std::array w{0,1,0,1,0,1,0,1}; - auto [mu2, sigma2_sq] = boost::math::tools::mean_and_population_variance(w.begin(), w.end()); - BOOST_TEST(abs(mu2 - 1.0/2.0) < tol); - BOOST_TEST(abs(sigma2_sq - 1.0/4.0) < tol); - - auto [mu3, sigma3_sq] = boost::math::tools::mean_and_population_variance(w); - BOOST_TEST(abs(mu3 - 1.0/2.0) < tol); - BOOST_TEST(abs(sigma3_sq - 1.0/4.0) < tol); - -} - -template -void test_integer_mean_and_population_variance() -{ - double tol = std::numeric_limits::epsilon(); - std::vector v{1,1,1,1,1,1}; - auto [mu, sigma_sq] = boost::math::tools::mean_and_population_variance(v); - BOOST_TEST(abs(mu - 1) < tol); - BOOST_TEST(abs(sigma_sq) < tol); -} - -template -void test_median() -{ - std::mt19937 g(12); - std::vector v{1,2,3,4,5,6,7}; - - Real m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 4); - - std::shuffle(v.begin(), v.end(), g); - // Does range call work? - m = boost::math::tools::median(v); - BOOST_TEST_EQ(m, 4); - - v = {1,2,3,3,4,5}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 3); - std::shuffle(v.begin(), v.end(), g); - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 3); - - v = {1}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 1); - - v = {1,1}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 1); - - v = {2,4}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 3); - - v = {1,1,1}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 1); - - v = {1,2,3}; - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 2); - std::shuffle(v.begin(), v.end(), g); - m = boost::math::tools::median(v.begin(), v.end()); - BOOST_TEST_EQ(m, 2); - - // Does it work with std::array? - std::array w{1,2,3}; - m = boost::math::tools::median(w); - BOOST_TEST_EQ(m, 2); -} - template void test_absolute_median() { @@ -261,34 +105,6 @@ void test_complex_absolute_median() } -template -void test_gini_coefficient() -{ - Real tol = std::numeric_limits::epsilon(); - std::vector v{1,0,0}; - Real gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); - BOOST_TEST(abs(gini - 1) < tol); - - gini = boost::math::tools::gini_coefficient(v); - BOOST_TEST(abs(gini - 1) < tol); - - v[0] = 1; - v[1] = 1; - v[2] = 1; - gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); - BOOST_TEST(abs(gini) < tol); - - v[0] = 0; - v[1] = 0; - v[2] = 0; - gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); - BOOST_TEST(abs(gini) < tol); - - std::array w{0,0,0}; - gini = boost::math::tools::gini_coefficient(w); - BOOST_TEST(abs(gini) < tol); -} - template void test_hoyer_sparsity() { @@ -411,30 +227,6 @@ void test_shannon_entropy() int main() { - test_integer_mean(); - test_integer_mean(); - test_integer_mean(); - - test_mean(); - test_mean(); - test_mean(); - test_mean(); - - test_complex_mean>(); - test_complex_mean(); - - test_mean_and_population_variance(); - test_mean_and_population_variance(); - test_mean_and_population_variance(); - test_mean_and_population_variance(); - - test_integer_mean_and_population_variance(); - - test_median(); - test_median(); - test_median(); - test_median(); - test_absolute_median(); test_absolute_median(); test_absolute_median(); @@ -445,11 +237,6 @@ int main() test_complex_absolute_median>(); test_complex_absolute_median(); - test_gini_coefficient(); - test_gini_coefficient(); - test_gini_coefficient(); - test_gini_coefficient(); - test_absolute_gini_coefficient(); test_absolute_gini_coefficient(); test_absolute_gini_coefficient(); diff --git a/test/univariate_statistics_test.cpp b/test/univariate_statistics_test.cpp new file mode 100644 index 000000000..955d1ad06 --- /dev/null +++ b/test/univariate_statistics_test.cpp @@ -0,0 +1,248 @@ +/* + * (C) Copyright Nick Thompson 2018. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using boost::multiprecision::cpp_bin_float_50; +using boost::multiprecision::cpp_complex_50; + +/* + * Test checklist: + * 1) Does it work with multiprecision? + * 2) Does it work with .cbegin()/.cend() if the data is not altered? + * 3) Does it work with ublas and std::array? (Checking Eigen and Armadillo will make the CI system really unhappy.) + * 4) Does it work with std::forward_list if a forward iterator is all that is required? + * 5) Does it work with complex data if complex data is sensible? + */ + +template +void test_integer_mean() +{ + double tol = std::numeric_limits::epsilon(); + std::vector v{1,2,3,4,5}; + double mu = boost::math::tools::mean(v); + BOOST_TEST(abs(mu - 3) < tol); + + // Work with std::array? + std::array w{1,2,3,4,5}; + mu = boost::math::tools::mean(w); + BOOST_TEST(abs(mu - 3) < tol); +} + +template +void test_mean() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,2,3,4,5}; + Real mu = boost::math::tools::mean(v.begin(), v.end()); + BOOST_TEST(abs(mu - 3) < tol); + + // Does range call work? + mu = boost::math::tools::mean(v); + BOOST_TEST(abs(mu - 3) < tol); + + // Can we successfully average only part of the vector? + mu = boost::math::tools::mean(v.begin(), v.begin() + 3); + BOOST_TEST(abs(mu - 2) < tol); + + // Does it work when we const qualify? + mu = boost::math::tools::mean(v.cbegin(), v.cend()); + BOOST_TEST(abs(mu - 3) < tol); + + // Does it work for std::array? + std::array u{1,2,3,4,5,6,7}; + mu = boost::math::tools::mean(u.begin(), u.end()); + BOOST_TEST(abs(mu - 4) < tol); + + // Does it work for a forward iterator? + std::forward_list l{1,2,3,4,5,6,7}; + mu = boost::math::tools::mean(l.begin(), l.end()); + BOOST_TEST(abs(mu - 4) < tol); + + // Does it work with ublas vectors? + boost::numeric::ublas::vector w(7); + for (size_t i = 0; i < w.size(); ++i) + { + w[i] = i+1; + } + mu = boost::math::tools::mean(w.cbegin(), w.cend()); + BOOST_TEST(abs(mu - 4) < tol); + +} + +template +void test_complex_mean() +{ + typedef typename Complex::value_type Real; + Real tol = std::numeric_limits::epsilon(); + std::vector v{{0,1},{0,2},{0,3},{0,4},{0,5}}; + auto mu = boost::math::tools::mean(v.begin(), v.end()); + BOOST_TEST(abs(mu.imag() - 3) < tol); + BOOST_TEST(abs(mu.real()) < tol); + + // Does range work? + mu = boost::math::tools::mean(v); + BOOST_TEST(abs(mu.imag() - 3) < tol); + BOOST_TEST(abs(mu.real()) < tol); +} + +template +void test_mean_and_population_variance() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1,1,1,1}; + auto [mu, sigma_sq] = boost::math::tools::mean_and_population_variance(v.begin(), v.end()); + BOOST_TEST(abs(mu - 1) < tol); + BOOST_TEST(abs(sigma_sq) < tol); + + std::vector u{1}; + auto [mu1, sigma1_sq] = boost::math::tools::mean_and_population_variance(u.cbegin(), u.cend()); + BOOST_TEST(abs(mu1 - 1) < tol); + BOOST_TEST(abs(sigma1_sq) < tol); + + std::array w{0,1,0,1,0,1,0,1}; + auto [mu2, sigma2_sq] = boost::math::tools::mean_and_population_variance(w.begin(), w.end()); + BOOST_TEST(abs(mu2 - 1.0/2.0) < tol); + BOOST_TEST(abs(sigma2_sq - 1.0/4.0) < tol); + + auto [mu3, sigma3_sq] = boost::math::tools::mean_and_population_variance(w); + BOOST_TEST(abs(mu3 - 1.0/2.0) < tol); + BOOST_TEST(abs(sigma3_sq - 1.0/4.0) < tol); + +} + +template +void test_integer_mean_and_population_variance() +{ + double tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1,1,1,1}; + auto [mu, sigma_sq] = boost::math::tools::mean_and_population_variance(v); + BOOST_TEST(abs(mu - 1) < tol); + BOOST_TEST(abs(sigma_sq) < tol); +} + +template +void test_median() +{ + std::mt19937 g(12); + std::vector v{1,2,3,4,5,6,7}; + + Real m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 4); + + std::shuffle(v.begin(), v.end(), g); + // Does range call work? + m = boost::math::tools::median(v); + BOOST_TEST_EQ(m, 4); + + v = {1,2,3,3,4,5}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 3); + std::shuffle(v.begin(), v.end(), g); + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 3); + + v = {1}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 1); + + v = {1,1}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 1); + + v = {2,4}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 3); + + v = {1,1,1}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 1); + + v = {1,2,3}; + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 2); + std::shuffle(v.begin(), v.end(), g); + m = boost::math::tools::median(v.begin(), v.end()); + BOOST_TEST_EQ(m, 2); + + // Does it work with std::array? + std::array w{1,2,3}; + m = boost::math::tools::median(w); + BOOST_TEST_EQ(m, 2); +} + +template +void test_gini_coefficient() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,0,0}; + Real gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini - 1) < tol); + + gini = boost::math::tools::gini_coefficient(v); + BOOST_TEST(abs(gini - 1) < tol); + + v[0] = 1; + v[1] = 1; + v[2] = 1; + gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini) < tol); + + v[0] = 0; + v[1] = 0; + v[2] = 0; + gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini) < tol); + + std::array w{0,0,0}; + gini = boost::math::tools::gini_coefficient(w); + BOOST_TEST(abs(gini) < tol); +} + +int main() +{ + test_integer_mean(); + test_integer_mean(); + test_integer_mean(); + + test_mean(); + test_mean(); + test_mean(); + test_mean(); + + test_complex_mean>(); + test_complex_mean(); + + test_mean_and_population_variance(); + test_mean_and_population_variance(); + test_mean_and_population_variance(); + test_mean_and_population_variance(); + + test_integer_mean_and_population_variance(); + + test_median(); + test_median(); + test_median(); + test_median(); + + test_gini_coefficient(); + test_gini_coefficient(); + test_gini_coefficient(); + test_gini_coefficient(); + + return boost::report_errors(); +}