From aa43b5b52b143b97357e9a23722ccede33566929 Mon Sep 17 00:00:00 2001 From: Nick Thompson Date: Thu, 6 Dec 2018 20:33:05 -0700 Subject: [PATCH] Vector functionals, first pass [CI SKIP] --- doc/math.qbk | 4 + doc/vector_functionals/vector_functionals.qbk | 203 +++++++++++++++ .../boost/math/tools/vector_functionals.hpp | 245 ++++++++++++++++++ test/vector_functionals_test.cpp | 217 ++++++++++++++++ 4 files changed, 669 insertions(+) create mode 100644 doc/vector_functionals/vector_functionals.qbk create mode 100644 include/boost/math/tools/vector_functionals.hpp create mode 100644 test/vector_functionals_test.cpp diff --git a/doc/math.qbk b/doc/math.qbk index 711e88f8c..90585886f 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -552,6 +552,10 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [include distributions/dist_reference.qbk] [/includes all individual distribution.qbk files] [endmathpart] [/section:dist Statistical Distributions and Functions] +[mathpart vector_functionals Vector Functionals] +[include vector_functionals/vector_functionals.qbk] +[endmathpart] [/section:vector_functionals Vector Functionals] + [mathpart special Special Functions] [include sf/number_series.qbk] diff --git a/doc/vector_functionals/vector_functionals.qbk b/doc/vector_functionals/vector_functionals.qbk new file mode 100644 index 000000000..046a1bcd3 --- /dev/null +++ b/doc/vector_functionals/vector_functionals.qbk @@ -0,0 +1,203 @@ +[/ + Copyright 2017 Nick Thompson + + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] + +[section:vector_functionals Vector Functionals] + +[heading Synopsis] + +`` +#include + +namespace boost{ namespace math{ namespace tools { + + template + auto mean(ForwardIterator first, ForwardIterator last); + + template + auto mean_and_variance(ForwardIterator first, ForwardIterator last); + + template + auto median(ForwardIterator first, ForwardIterator last); + + template + auto absolute_median(ForwardIterator first, ForwardIterator last); + + template + auto shannon_entropy(ForwardIterator first, ForwardIterator last); + + template + auto normalized_shannon_entropy(ForwardIterator first, ForwardIterator last); + + template + auto gini_coefficient(ForwardIterator first, ForwardIterator last); + + template + auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last); + + template + auto pq_mean(ForwardIterator first, ForwardIterator last, p, q); + + template + auto lp_norm(ForwardIterator first, ForwardIterator last, p); + + template + auto l0_norm(ForwardIterator first, ForwardIterator last); + + template + auto l1_norm(ForwardIterator first, ForwardIterator last); + + template + auto l2_norm(ForwardIterator first, ForwardIterator last); + + template + auto sup_norm(ForwardIterator first, ForwardIterator last); + + template + auto lp_distance(RandomAccessContainer const & u, RandomAccessContainer const & v, p); + + template + auto l1_distance(RandomAccessContainer const & u, RandomAccessContainer const & v); + + template + auto l2_distance(RandomAccessContainer const & u, RandomAccessContainer const & v); + + template + auto sup_distance(RandomAccessContainer const & u, RandomAccessContainer const & v); + + template + auto total_variation(ForwardIterator first, ForwardIterator last); + + template + auto lanczos_noisy_derivative(RandomAccessContainer const & v, time_step, time); + + template + auto kurtosis(ForwardIterator first, ForwardIterator last); + + template + auto skewness(ForwardIterator first, ForwardIterator last); + + template + auto covariance(RandomAccessContainer const & u, RandomAccessContainer const & v); + + template + auto simpsons_rule_quadrature(ForwardIterator first, ForwardIterator last); + + template + auto simpsons_three_eighths_quadrature(ForwardIterator first, ForwardIterator last); + + template + auto booles_rule_quadrature(ForwardIterator first, ForwardIterator last); + + template + auto inner_product(RandomAccessContainer const & u, RandomAccessContainer const & v); + + +}}} +`` + +[heading Description] + +The file `boost/math/tools/vector_functionals.hpp` is a set of facilities for computing scalar values from vectors. +We use the word "vector functional" in the [@https://ncatlab.org/nlab/show/nonlinear+functional mathematical sense], indicating a map \u2113:\u211D[super n] \u2192 \u211D, +and occasionally maps from \u2102[super n] \u2192 \u211D and \u2102[super n] \u2192 \u2102. +The set of maps provided herein attempt to cover the most commonly encountered functionals from statistics, numerical analysis, and signal processing. + +Many of these functionals have trivial naive implementations, but experienced programmers will recognize that even trivial algorithms are easy to screw up, and that numerical instabilities often lurk in corner cases. +We have attempted to do our "due diligence" to root out these problems-scouring the literature for numerically stable algorithms for even the simplest of functionals. + +/Nota bene/: Some similar functionality is provided in [@https://www.boost.org/doc/libs/1_68_0/doc/html/accumulators/user_s_guide.html Boost Accumulators Framework]. +These accumulators should be used in real-time applications; `vector_functionals.hpp` should be used when CPU vectorization is needed. +As a reminder, remember that to actually /get/ vectorization, compile with `-march=native -O3` flags. + +We now describe each functional in detail. + +Compute the mean of a container: + + std::vector v{1,2,3,4,5}; + double mu = mean(v.begin(), v.end()); + +The implementation follows [@https://doi.org/10.1137/1.9780898718027 Higham 1.6a]. +The only requirement on the input is that it must be forward iterable, so you can use Eigen vectors, ublas vectors, Armadillo vectors, or a `std::forward_list` to hold your data. + + +Compute the mean and sample variance: + + std::vector v{1,2,3,4,5}; + auto [mu, s] = mean_and_sample_variance(v.begin(), v.end()); + +The implementation follows [@https://doi.org/10.1137/1.9780898718027 Higham 1.6b]. +Note that we do not provide computation of sample variance alone; +we are unaware of any one-pass, numerically stable computation of sample variance which does not require simultaneous computation of the mean. +If the mean is not required, simply ignore it. +The input datatype must be forward iterable and the range `[first, last)` must contain at least two elements. + +Compute the median of a dataset: + + std::vector v{1,2,3,4,5}; + double m = boost::math::tools::median(v.begin(), v.end()); + +/Nota bene: The input vector is modified./ +The calculation of the median is a thin wrapper around the C++11 [@https://en.cppreference.com/w/cpp/algorithm/nth_element nth-element]. +Therefore, all requirements of `nth_element` are inherited by the median calculation. + + +Compute the sup norm of a dataset: + + std::vector v{-3, 2, 1}; + double sup = boost::math::tools::sup_norm(v.begin(), v.end()); + // sup = 3 + + std::vector> v{{0, -8}, {1,1}, {-3,2}}; + double sup = boost::math::tools::sup_norm(v.begin(), v.end()); + // sup = 8 + +Note how the calculation of \u2113[super p] norms can be performed in both real and complex arithmetic. + +Compute the Gini coefficient of a dataset: + + std::vector v{1,0,0,0}; + double gini = gini_coefficient(v.begin(), v.end()); + // gini = 1, as v[0] holds all the "wealth" + std::vector w{1,1,1,1}; + gini = gini_coefficient(w.begin(), w.end()); + // gini = 0, as all elements are now equal. + +/Nota bene: The input data is altered-in particular, it is sorted./ + +/Nota bene:/ Different authors use different conventions regarding the overall scale of the Gini coefficient. +We have chosen to follow [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard's definition], which [@https://en.wikipedia.org/wiki/Gini_coefficient Wikipedia] calls a "consistent estimator" of the population Gini coefficient. +Hurley and Rickard's definition places the Gini coefficient in the range [0,1]; Wikipedia's population Gini coefficient is in the range [0, 1 - 1/N]. + +The Gini coefficient, first used to measure wealth inequality, is also one of the best measures of the sparsity of an expansion in a basis. +A sparse expansion has most of its norm concentrated in just a few coefficients, making the connection with wealth inequality obvious. +However, for measuring sparsity, the phase of the numbers is irrelevant, so `absolute_gini_coefficient` should be used instead: + + std::vector> v{{0,1}, {0,0}, {0,0}, {0,0}}; + double abs_gini = absolute_gini_coefficient(v.begin(), v.end()); + // abs_gini = 1 + std::vector> w{{0,1}, {1,0}, {0,-1}, {-1,0}}; + double abs_gini = absolute_gini_coefficient(w.begin(), w.end()); + // abs_gini = 0 + + + +[heading Examples] + +[heading Performance] + +[heading Caveats] + +[heading References] + +* Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. +* Mallat, Stephane. ['A wavelet tour of signal processing: the sparse way.] Academic press, 2008. +* Hurley, Niall, and Scott Rickard. ['Comparing measures of sparsity.] IEEE Transactions on Information Theory 55.10 (2009): 4723-4741. + + +[endsect] +[/section:vector_functionals Vector Functionals] diff --git a/include/boost/math/tools/vector_functionals.hpp b/include/boost/math/tools/vector_functionals.hpp new file mode 100644 index 000000000..8e77e97f9 --- /dev/null +++ b/include/boost/math/tools/vector_functionals.hpp @@ -0,0 +1,245 @@ +// (C) Copyright Nick Thompson 2018. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_TOOLS_VECTOR_FUNCTIONALS_HPP +#define BOOST_MATH_TOOLS_VECTOR_FUNCTIONALS_HPP + +#include +#include +#include +#include +#include +/* + * A set of tools for computing scalar quantities associated with lists of numbers. + */ + + +namespace boost{ namespace math{ namespace tools { + +template +auto +mean(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean."); + Real mu = 0; + Real i = 1; + for(auto it = first; it != last; ++it) { + mu = mu + (*it - mu)/i; + i += 1; + } + return mu; +} + +template +auto +mean_and_population_variance(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance."); + // Higham, Accuracy and Stability, equation 1.6a and 1.6b: + Real M = *first; + Real Q = 0; + Real k = 2; + for (auto it = first + 1; it != last; ++it) + { + Real tmp = *it - M; + Q = Q + ((k-1)*tmp*tmp)/k; + M = M + tmp/k; + k += 1; + } + + return std::make_pair(M, Q/(k-1)); +} + +template +auto median(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + Real m = 0; + return m; +} + +// Mallat, "A Wavelet Tour of Signal Processing", equation 2.60: +template +auto total_variation(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + BOOST_ASSERT_MSG(first != last && std::next(first) != last, "At least two samples are required to compute the total variation."); + Real tot = 0; + auto it = first; + Real tmp = *it; + while (++it != last) + { + tot += abs(*it - tmp); + tmp = *it; + } + return tot; +} + +// Mallat, equation 10.4 uses the base-2 logarithm. +template +auto shannon_entropy(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + using std::log2; + Real entropy = 0; + for (auto it = first; it != last; ++it) + { + Real tmp = *it; + if (tmp != 0) + { + entropy += tmp*log2(tmp); + } + } + return -entropy; +} + +template +auto sup_norm(ForwardIterator first, ForwardIterator last) +{ + BOOST_ASSERT_MSG(first != last, "At least one value is required to compute the sup norm."); + typedef typename std::remove_const())>::type>::type RealOrComplex; + using std::abs; + if constexpr (boost::is_complex::value) + { + auto it = max_element(first, last, [](RealOrComplex a, RealOrComplex b) { return abs(b) > abs(a); }); + return abs(*it); + } + else + { + auto pair = minmax_element(first, last); + if (abs(*pair.first) > abs(*pair.second)) + { + return abs(*pair.first); + } + else + { + return abs(*pair.second); + } + } +} + +template +size_t l0_norm(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type RealOrComplex; + size_t count = 0; + for (auto it = first; it != last; ++it) + { + if (*it != RealOrComplex(0)) + { + ++count; + } + } + return count; +} + +template +auto lp_norm(ForwardIterator first, ForwardIterator last, typename std::remove_const())>::type>::type p) +{ + using std::pow; + using std::is_floating_point; + using std::isfinite; + typedef typename std::remove_const())>::type>::type RealOrComplex; + if constexpr (boost::is_complex::value) + { + BOOST_ASSERT_MSG(p.real() >= 0, "For p < 0, the lp norm is not a norm."); + BOOST_ASSERT_MSG(p.imag() == 0, "For imaginary p, the lp norm is not a norm."); + using std::norm; + decltype(p.real()) lp = 0; + for (auto it = first; it != last; ++it) + { + lp += pow(norm(*it), p.real()/2); + } + + auto result = pow(lp, 1/p.real()); + if (!isfinite(result)) + { + auto a = boost::math::tools::sup_norm(first, last); + decltype(p.real()) lp = 0; + for (auto it = first; it != last; ++it) + { + lp += pow(abs(*it)/a, p.real()); + } + result = a*pow(lp, 1/p.real()); + } + return result; + } + else if constexpr (is_floating_point::value) + { + BOOST_ASSERT_MSG(p >= 0, "For p < 0, the lp norm is not a norm"); + RealOrComplex lp = 0; + for (auto it = first; it != last; ++it) + { + lp += pow(abs(*it), p); + } + RealOrComplex result = pow(lp, 1/p); + if (!isfinite(result)) + { + RealOrComplex a = boost::math::tools::sup_norm(first, last); + lp = 0; + for (auto it = first; it != last; ++it) + { + lp += pow(abs(*it)/a, p); + } + result = a*pow(lp, 1/p); + } + return result; + } + else + { + BOOST_ASSERT_MSG(false, "Unable to determine if the input type is real or complex."); + } +} + +template +auto gini_coefficient(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type Real; + BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); + + std::sort(first, last); + + Real i = 1; + Real num = 0; + Real denom = 0; + for (auto it = first; it != last; ++it) { + num += *it*i; + denom += *it; + ++i; + } + return ((2*num)/denom - i)/(i-2); +} + + +template +auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last) +{ + typedef typename std::remove_const())>::type>::type RealOrComplex; + BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); + + std::sort(first, last, [](RealOrComplex a, RealOrComplex b) { return abs(b) > abs(a); }); + + + decltype(abs(*first)) i = 1; + decltype(abs(*first)) num = 0; + decltype(abs(*first)) denom = 0; + std::cout << "{"; + for (auto it = first; it != last; ++it) { + std::cout << abs(*it) << ", "; + decltype(abs(*first)) tmp = abs(*it); + num += tmp*i; + denom += tmp; + ++i; + } + std::cout << "}\n"; + return ((2*num)/denom - i)/(i-2); +} + + + +}}} +#endif diff --git a/test/vector_functionals_test.cpp b/test/vector_functionals_test.cpp new file mode 100644 index 000000000..c82037932 --- /dev/null +++ b/test/vector_functionals_test.cpp @@ -0,0 +1,217 @@ +/* + * (C) Copyright Nick Thompson 2018. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using boost::multiprecision::cpp_bin_float_50; +using boost::multiprecision::cpp_complex_50; + +template +void test_mean() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,2,3,4,5}; + Real mu = boost::math::tools::mean(v.begin(), v.end()); + BOOST_TEST(abs(mu - 3) < tol); + + // Can we successfully average only part of the vector? + mu = boost::math::tools::mean(v.begin(), v.begin() + 3); + BOOST_TEST(abs(mu - 2) < tol); + + // Does it work when we const qualify? + mu = boost::math::tools::mean(v.cbegin(), v.cend()); + BOOST_TEST(abs(mu - 3) < tol); + + // Does it work for std::array? + std::array u{1,2,3,4,5,6,7}; + mu = boost::math::tools::mean(u.begin(), u.end()); + BOOST_TEST(abs(mu - 4) < tol); + + // Does it work for a forward iterator? + std::forward_list l{1,2,3,4,5,6,7}; + mu = boost::math::tools::mean(l.begin(), l.end()); + BOOST_TEST(abs(mu - 4) < tol); + + // Does it work with ublas vectors? + boost::numeric::ublas::vector w(7); + for (size_t i = 0; i < w.size(); ++i) + { + w[i] = i+1; + } + mu = boost::math::tools::mean(w.cbegin(), w.cend()); + BOOST_TEST(abs(mu - 4) < tol); + +} + +template +void test_mean_and_population_variance() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,1,1,1,1,1}; + auto [mu, sigma_sq] = boost::math::tools::mean_and_population_variance(v.begin(), v.end()); + BOOST_TEST(abs(mu - 1) < tol); + BOOST_TEST(abs(sigma_sq) < tol); + + std::vector u{1}; + auto [mu1, sigma1_sq] = boost::math::tools::mean_and_population_variance(u.begin(), u.end()); + BOOST_TEST(abs(mu1 - 1) < tol); + BOOST_TEST(abs(sigma1_sq) < tol); + + std::vector w{0,1,0,1,0,1,0,1}; + auto [mu2, sigma2_sq] = boost::math::tools::mean_and_population_variance(w.begin(), w.end()); + BOOST_TEST(abs(mu2 - 1.0/2.0) < tol); + BOOST_TEST(abs(sigma2_sq - 1.0/4.0) < tol); +} + +template +void test_lp() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector> v{{1,0}, {0,0}, {0,0}}; + Real l3 = boost::math::tools::lp_norm(v.begin(), v.end(), 3); + BOOST_TEST(abs(l3 - 1) < tol); + + std::vector u{1,0,0}; + l3 = boost::math::tools::lp_norm(u.begin(), u.end(), 3); + BOOST_TEST(abs(l3 - 1) < tol); +} + +template +void test_total_variation() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,1}; + Real tv = boost::math::tools::total_variation(v.begin(), v.end()); + BOOST_TEST(tv >= 0 && abs(tv) < tol); + + v[1] = 2; + tv = boost::math::tools::total_variation(v.begin(), v.end()); + BOOST_TEST(abs(tv - 1) < tol); + + v.resize(50); + for (size_t i = 0; i < v.size(); ++i) { + v[i] = i; + } + + tv = boost::math::tools::total_variation(v.begin(), v.end()); + BOOST_TEST(abs(tv - (v.size() -1)) < tol); + + for (size_t i = 0; i < v.size(); ++i) { + v[i] = i*i; + } + + tv = boost::math::tools::total_variation(v.begin(), v.end()); + BOOST_TEST(abs(tv - (v.size() -1)*(v.size()-1)) < tol); +} + +template +void test_sup_norm() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{-2,1,0}; + Real s = boost::math::tools::sup_norm(v.begin(), v.end()); + BOOST_TEST(abs(s - 2) < tol); + + std::vector> w{{0,-8}, {1,1}, {3,2}}; + s = boost::math::tools::sup_norm(w.begin(), w.end()); + BOOST_TEST(abs(s-8) < tol); +} + +template +void test_gini_coefficient() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{1,0,0}; + Real gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini - 1) < tol); + + v[0] = 1; + v[1] = 1; + v[2] = 1; + gini = boost::math::tools::gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini) < tol); +} + +template +void test_absolute_gini_coefficient() +{ + Real tol = std::numeric_limits::epsilon(); + std::vector v{-1,0,0}; + Real gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini - 1) < tol); + + v[0] = 1; + v[1] = -1; + v[2] = 1; + gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); + BOOST_TEST(abs(gini) < tol); + + std::vector> w(128); + std::complex i{0,1}; + for(size_t k = 0; k < w.size(); ++k) + { + w[k] = exp(i*static_cast(k)/static_cast(w.size())); + } + gini = boost::math::tools::absolute_gini_coefficient(w.begin(), w.end()); + BOOST_TEST(abs(gini) < tol); + +} + +template +void test_l0_norm() +{ + std::vector v{0,0,1}; + size_t count = boost::math::tools::l0_norm(v.begin(), v.end()); + BOOST_TEST_EQ(count, 1); +} + +int main() +{ + test_mean(); + test_mean(); + test_mean(); + test_mean(); + + test_mean_and_population_variance(); + test_mean_and_population_variance(); + test_mean_and_population_variance(); + test_mean_and_population_variance(); + + test_lp(); + test_lp(); + test_lp(); + + test_total_variation(); + test_total_variation(); + test_total_variation(); + test_total_variation(); + + test_sup_norm(); + test_sup_norm(); + test_sup_norm(); + + test_gini_coefficient(); + test_gini_coefficient(); + test_gini_coefficient(); + + test_absolute_gini_coefficient(); + test_absolute_gini_coefficient(); + test_absolute_gini_coefficient(); + + test_l0_norm(); + test_l0_norm(); + test_l0_norm(); + + return boost::report_errors(); +}