diff --git a/doc/math.qbk b/doc/math.qbk index 0903a6d97..3d184ad8d 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -554,6 +554,7 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [mathpart vector_functionals Vector Functionals] [include vector_functionals/univariate_statistics.qbk] +[include vector_functionals/bivariate_statistics.qbk] [include vector_functionals/signal_statistics.qbk] [include vector_functionals/norms.qbk] [endmathpart] [/section:vector_functionals Vector Functionals] diff --git a/doc/vector_functionals/bivariate_statistics.qbk b/doc/vector_functionals/bivariate_statistics.qbk new file mode 100644 index 000000000..8cf4dcdc7 --- /dev/null +++ b/doc/vector_functionals/bivariate_statistics.qbk @@ -0,0 +1,54 @@ +[/ + Copyright 2018 Nick Thompson + + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] + +[section:bivariate_statistics Bivariate Statistics] + +[heading Synopsis] + +`` +#include + +namespace boost{ namespace math{ namespace tools { + + template + auto population_covariance(Container const & u, Container const & v); + + template + auto means_and_population_covariance(Container const & u, Container const & v); + +}}} +`` + +[heading Description] + +This file provides functions for computing bivariate statistics. + +[heading Population Covariance] + + std::vector u{1,2,3,4,5}; + std::vector v{1,2,3,4,5}; + double cov_uv = boost::math::tools::population_covariance(u, v); + +The implementation follows [@https://doi.org/10.1109/CLUSTR.2009.5289161 Bennet et al]. +The data is not modified and must be forward iterable. +Works with real-valued inputs and does not work with complex-valued inputs. + +The algorithm used herein simultaneously generates the mean values of the input data /u/ and /v/. +For certain applications, it might be useful to get them in a single pass. +As such, we provide `means_and_population_covariance`: + + std::vector u{1,2,3,4,5}; + std::vector v{1,2,3,4,5}; + auto [mu_u, mu_v, cov_uv] = boost::math::tools::means_and_population_covariance(u, v); + +[heading References] + +* Bennett, Janine, et al. ['Numerically stable, single-pass, parallel statistics algorithms.] Cluster Computing and Workshops, 2009. CLUSTER'09. IEEE International Conference on. IEEE, 2009. + +[endsect] +[/section:bivariate_statistics Bivariate Statistics] diff --git a/include/boost/math/tools/bivariate_statistics.hpp b/include/boost/math/tools/bivariate_statistics.hpp new file mode 100644 index 000000000..4a2a3883b --- /dev/null +++ b/include/boost/math/tools/bivariate_statistics.hpp @@ -0,0 +1,53 @@ +// (C) Copyright Nick Thompson 2018. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_TOOLS_BIVARIATE_STATISTICS_HPP +#define BOOST_MATH_TOOLS_BIVARIATE_STATISTICS_HPP + +#include +#include +#include +#include +#include + + +namespace boost{ namespace math{ namespace tools { + +template +auto +means_and_population_covariance(Container const & u, Container const & v) +{ + using Real = typename Container::value_type; + using std::size; + BOOST_ASSERT_MSG(size(u) == size(v), "The size of each vector must be the same to compute covariance."); + BOOST_ASSERT_MSG(size(u) > 0, "Computing covariance requires at least one sample."); + + // See Equation III.9 of "Numerically Stable, Single-Pass, Parallel Statistics Algorithms", Bennet et al. + Real cov = 0; + Real mu_u = u[0]; + Real mu_v = v[0]; + + for(size_t i = 1; i < size(u); ++i) + { + Real u_tmp = (u[i] - mu_u)/(i+1); + Real v_tmp = v[i] - mu_v; + cov += i*u_tmp*v_tmp; + mu_u = mu_u + u_tmp; + mu_v = mu_v + v_tmp/(i+1); + } + + return std::make_tuple(mu_u, mu_v, cov/size(u)); +} + +template +auto +population_covariance(Container const & u, Container const & v) +{ + auto [mu_u, mu_v, cov] = boost::math::tools::means_and_population_covariance(u, v); + return cov; +} + +}}} +#endif diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 2bb6afd7c..b15f009fb 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -905,6 +905,7 @@ test-suite misc : [ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] [ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] ] [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr ] ] + [ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr ] ] [ run test_real_concept.cpp ../../test/build//boost_unit_test_framework ] [ run test_remez.cpp pch ../../test/build//boost_unit_test_framework ] [ run test_roots.cpp pch ../../test/build//boost_unit_test_framework ] diff --git a/test/bivariate_statistics_test.cpp b/test/bivariate_statistics_test.cpp new file mode 100644 index 000000000..924cb6849 --- /dev/null +++ b/test/bivariate_statistics_test.cpp @@ -0,0 +1,121 @@ +/* + * (C) Copyright Nick Thompson 2018. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using boost::multiprecision::cpp_bin_float_50; +using boost::multiprecision::cpp_complex_50; + +/* + * Test checklist: + * 1) Does it work with multiprecision? + * 2) Does it work with .cbegin()/.cend() if the data is not altered? + * 3) Does it work with ublas and std::array? (Checking Eigen and Armadillo will make the CI system really unhappy.) + * 4) Does it work with std::forward_list if a forward iterator is all that is required? + * 5) Does it work with complex data if complex data is sensible? + */ + +using boost::math::tools::means_and_population_covariance; +using boost::math::tools::population_covariance; + +template +void test_covariance() +{ + std::cout << std::setprecision(std::numeric_limits::digits10+1); + Real tol = std::numeric_limits::epsilon(); + using std::abs; + + // Covariance of a single thing is zero: + std::array u1{8}; + std::array v1{17}; + auto [mu_u1, mu_v1, cov1] = means_and_population_covariance(u1, v1); + + BOOST_TEST(abs(cov1) < tol); + BOOST_TEST(abs(mu_u1 - 8) < tol); + BOOST_TEST(abs(mu_v1 - 17) < tol); + + + std::array u2{8, 4}; + std::array v2{3, 7}; + auto [mu_u2, mu_v2, cov2] = means_and_population_covariance(u2, v2); + + BOOST_TEST(abs(cov2+4) < tol); + BOOST_TEST(abs(mu_u2 - 6) < tol); + BOOST_TEST(abs(mu_v2 - 5) < tol); + + std::vector u3{1,2,3}; + std::vector v3{1,1,1}; + + auto [mu_u3, mu_v3, cov3] = means_and_population_covariance(u3,v3); + + // Since v is constant, covariance(u,v) = 0 against everything any u: + BOOST_TEST(abs(cov3) < tol); + BOOST_TEST(abs(mu_u3 - 2) < tol); + BOOST_TEST(abs(mu_v3 - 1) < tol); + // Make sure we pull the correct symbol out of means_and_populaton_covariance: + cov3 = population_covariance(u3, v3); + BOOST_TEST(abs(cov3) < tol); + + cov3 = population_covariance(v3, u3); + // Covariance is symmetric: cov(u,v) = cov(v,u) + BOOST_TEST(abs(cov3) < tol); + + // cov(u,u) = sigma(u)^2: + cov3 = population_covariance(u3, u3); + Real expected = Real(2)/Real(3); + + BOOST_TEST(abs(cov3 - expected) < tol); + + std::mt19937 gen(15); + // Can't template standard library on multiprecision, so use double and cast back: + std::uniform_real_distribution dis(-1.0, 1.0); + std::vector u(500); + std::vector v(500); + for(size_t i = 0; i < u.size(); ++i) { + u[i] = (Real) dis(gen); + v[i] = (Real) dis(gen); + } + + auto [mu_u, sigma_u_sq] = boost::math::tools::mean_and_population_variance(u); + auto [mu_v, sigma_v_sq] = boost::math::tools::mean_and_population_variance(v); + + auto [mu_u_, mu_v_, cov_uv] = means_and_population_covariance(u, v); + BOOST_TEST(abs(mu_u - mu_u_) < tol); + BOOST_TEST(abs(mu_v - mu_v_) < tol); + + // Cauchy-Schwartz inequality: + BOOST_TEST(cov_uv*cov_uv <= sigma_u_sq*sigma_v_sq); + // cov(X, X) = sigma(X)^2: + Real cov_uu = population_covariance(u, u); + BOOST_TEST(abs(cov_uu - sigma_u_sq) < tol); + Real cov_vv = population_covariance(v, v); + BOOST_TEST(abs(cov_vv - sigma_v_sq) < tol); + +} + +int main() +{ + test_covariance(); + test_covariance(); + test_covariance(); + test_covariance(); + + return boost::report_errors(); +}