diff --git a/doc/distributions/dist_reference.qbk b/doc/distributions/dist_reference.qbk index 3d5d82fe8..7231d39e3 100644 --- a/doc/distributions/dist_reference.qbk +++ b/doc/distributions/dist_reference.qbk @@ -37,6 +37,7 @@ [include triangular.qbk] [include uniform.qbk] [include weibull.qbk] +[include empirical_cdf.qbk] [endsect] [/section:dists Distributions] @@ -138,10 +139,3 @@ opportunity to integrate the statistical tests with this framework at some later (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt). ] - - - - - - - diff --git a/doc/distributions/empirical_cdf.qbk b/doc/distributions/empirical_cdf.qbk new file mode 100644 index 000000000..81faf76ff --- /dev/null +++ b/doc/distributions/empirical_cdf.qbk @@ -0,0 +1,71 @@ +[/ +Copyright (c) 2019 Nick Thompson +Use, modification and distribution are subject to the +Boost Software License, Version 1.0. (See accompanying file +LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +] + +[section:empirical_cdf Empirical Cumulative Distribution Function] + +[heading Synopsis] + +``` +#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp> + +namespace boost{ namespace math{ + +template<class RandomAccessContainer> +class empirical_cumulative_distribution_function +{ +public: + using Real = typename RandomAccessContainer::value_type; + empirical_cumulative_distribution_function(RandomAccessContainer && v); + + auto operator()(Real t) const; +}; + +}} +``` + +[heading Empirical Cumulative Distribution Function] + +The empirical cumulative distribution function is a step function constructed from observed data which converges to the true cumulative distribution function in the limit of infinite data. +This function is a basic building block of hypothesis testing workflows that attempt to answer the question "does my data come from a given distribution?" 
+These tests require computing quadratures over some function of the empirical CDF and the supposed CDF to create a distance measurement, and hence it is occasionally useful to construct a continuous callable from the data. + +An example usage is demonstrated below: + +``` +#include <vector> +#include <random> +#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp> +using boost::math::empirical_cumulative_distribution_function; +std::random_device rd; +std::mt19937 gen{rd()}; +std::normal_distribution<double> dis(0, 1); +size_t n = 128; +std::vector<double> v(n); +for (size_t i = 0; i < n; ++i) { + v[i] = dis(gen); +} + +auto ecdf = empirical_cumulative_distribution_function(std::move(v)); +std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n"; +// should print approximately 0.5 . . . +``` + +The empirical distribution function operates on sorted data. +If the data are not already sorted, the constructor sorts them for you at O(N log(N)) cost. + +Call operator complexity is O(log(N)). + +Works with both integer and floating-point types. +If the input data consists of integers, the output of the call operator is a double. Requires C++17. 
+ +[$../graphs/empiricial_cumulative_distribution_gauss.svg] + +[$../graphs/empiricial_cumulative_distribution_uniform.svg] + + +[endsect] +[/section:empirical_cdf] diff --git a/doc/graphs/empiricial_cumulative_distribution_gauss.svg b/doc/graphs/empiricial_cumulative_distribution_gauss.svg index 730652135..f07a6b51c 100644 --- a/doc/graphs/empiricial_cumulative_distribution_gauss.svg +++ b/doc/graphs/empiricial_cumulative_distribution_gauss.svg @@ -2,7 +2,7 @@ -Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) Gaussian distribution for n = 128 samples +Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) distribution on 128 samples diff --git a/doc/graphs/empiricial_cumulative_distribution_uniform.svg b/doc/graphs/empiricial_cumulative_distribution_uniform.svg index 47d6001dd..46d3a7a15 100644 --- a/doc/graphs/empiricial_cumulative_distribution_uniform.svg +++ b/doc/graphs/empiricial_cumulative_distribution_uniform.svg @@ -2,7 +2,7 @@ -Empirical (blue) and theoretical CDF (orange) of an dice roll distribution on n = 128 samples +Empirical (blue) and theoretical CDF (orange) of the dice roll distribution on n = 128 samples diff --git a/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp b/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp index a269e130a..a880580ac 100644 --- a/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp +++ b/include/boost/math/distributions/empirical_cumulative_distribution_function.hpp @@ -6,6 +6,7 @@ #ifndef BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP #define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP #include +#include namespace boost { namespace math{ @@ -22,7 +23,8 @@ public: } auto operator()(Real x) const { - if constexpr (std::is_integral_v) { + if constexpr (std::is_integral_v) + { if (x < m_v[0]) { return double(0); } @@ -32,15 +34,16 @@ public: auto it = 
std::upper_bound(m_v.begin(), m_v.end(), x); return static_cast(std::distance(m_v.begin(), it))/static_cast(m_v.size()); } - else { - if (x < m_v[0]) { - return Real(0); - } - if (x >= m_v[m_v.size()-1]) { - return Real(1); - } - auto it = std::upper_bound(m_v.begin(), m_v.end(), x); - return static_cast(std::distance(m_v.begin(), it))/static_cast(m_v.size()); + else + { + if (x < m_v[0]) { + return Real(0); + } + if (x >= m_v[m_v.size()-1]) { + return Real(1); + } + auto it = std::upper_bound(m_v.begin(), m_v.end(), x); + return static_cast(std::distance(m_v.begin(), it))/static_cast(m_v.size()); } } diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index a75340d88..5583bcfa2 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -951,6 +951,7 @@ test-suite misc : [ run compile_test/catmull_rom_concept_test.cpp compile_test_main : : : [ requires cxx11_hdr_array cxx11_hdr_initializer_list ] ] [ run ooura_fourier_integral_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] + [ run empirical_cumulative_distribution_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] diff --git a/test/empirical_cumulative_distribution_test.cpp b/test/empirical_cumulative_distribution_test.cpp index ec6f8336a..632f3653b 100644 --- a/test/empirical_cumulative_distribution_test.cpp +++ b/test/empirical_cumulative_distribution_test.cpp @@ -16,18 +16,54 @@ using boost::multiprecision::float128; #endif -using boost::math::distributions::empirical_cumulative_distribution_function; +using 
boost::math::empirical_cumulative_distribution_function; + +template +void test_uniform_z() +{ + std::vector v{6,3,4,1,2,5}; + + auto ecdf = empirical_cumulative_distribution_function(std::move(v)); + + CHECK_ULP_CLOSE(1.0/6.0, ecdf(1), 1); + CHECK_ULP_CLOSE(2.0/6.0, ecdf(2), 1); + CHECK_ULP_CLOSE(3.0/6.0, ecdf(3), 1); + CHECK_ULP_CLOSE(4.0/6.0, ecdf(4), 1); + CHECK_ULP_CLOSE(5.0/6.0, ecdf(5), 1); + CHECK_ULP_CLOSE(6.0/6.0, ecdf(6), 1); + + // Less trivial: + + v = {6,3,4,1,1,1,2,4}; + ecdf = empirical_cumulative_distribution_function(std::move(v)); + CHECK_ULP_CLOSE(3.0/8.0, ecdf(1), 1); + CHECK_ULP_CLOSE(4.0/8.0, ecdf(2), 1); + CHECK_ULP_CLOSE(5.0/8.0, ecdf(3), 1); + CHECK_ULP_CLOSE(7.0/8.0, ecdf(4), 1); + CHECK_ULP_CLOSE(7.0/8.0, ecdf(5), 1); + CHECK_ULP_CLOSE(8.0/8.0, ecdf(6), 1); +} template void test_uniform() { + size_t n = 128; + std::vector v(n); + for (size_t i = 0; i < n; ++i) { + v[i] = Real(i+1)/Real(n); + } + auto ecdf = empirical_cumulative_distribution_function(std::move(v)); + + for (size_t i = 0; i < n; ++i) { + CHECK_ULP_CLOSE(Real(i+1)/Real(n), ecdf(Real(i+1)/Real(n)), 1); + } } int main() { - - test_uniform(); + test_uniform_z(); + test_uniform(); return boost::math::test::report_errors(); }