2
0
mirror of https://github.com/boostorg/math.git synced 2026-01-19 04:22:09 +00:00

Empirical Cumulative Distribution function

This commit is contained in:
Nick
2019-09-24 11:00:07 -04:00
parent 72a66b109a
commit ee555a0738
7 changed files with 127 additions and 22 deletions

View File

@@ -37,6 +37,7 @@
[include triangular.qbk] [include triangular.qbk]
[include uniform.qbk] [include uniform.qbk]
[include weibull.qbk] [include weibull.qbk]
[include empirical_cdf.qbk]
[endsect] [/section:dists Distributions] [endsect] [/section:dists Distributions]
@@ -138,10 +139,3 @@ opportunity to integrate the statistical tests with this framework at some later
(See accompanying file LICENSE_1_0.txt or copy at (See accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt). http://www.boost.org/LICENSE_1_0.txt).
] ]

View File

@@ -0,0 +1,71 @@
[/
Copyright (c) 2019 Nick Thompson
Use, modification and distribution are subject to the
Boost Software License, Version 1.0. (See accompanying file
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
]
[section:empirical_cdf Empirical Cumulative Distribution Function]
[heading Synopsis]
```
#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
namespace boost{ namespace math{
template <class RandomAccessContainer>
class empirical_cumulative_distribution_function
{
public:
using Real = typename RandomAccessContainer::value_type;
empirical_cumulative_distribution_function(RandomAccessContainer && v);
auto operator()(Real t) const;
};
}}
```
[heading Empirical Cumulative Distribution Function]
The empirical cumulative distribution function is a step function constructed from observed data which converges to the true cumulative distribution function in the limit of infinite data.
This function is a basic building block of hypothesis testing workflows that attempt to answer the question "does my data come from a given distribution?"
These tests require computing quadratures over some function of the empirical CDF and the supposed CDF to create a distance measurement, and hence it is occasionally useful to construct a continuous callable from the data.
An example usage is demonstrated below:
```
#include <vector>
#include <random>
#include <iostream>
#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
using boost::math::empirical_cumulative_distribution_function;
std::random_device rd;
std::mt19937 gen{rd()};
std::normal_distribution<double> dis(0, 1);
size_t n = 128;
std::vector<double> v(n);
for (size_t i = 0; i < n; ++i) {
v[i] = dis(gen);
}
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n";
// should print approximately 0.5 . . .
```
The empirical distribution function operates on sorted data.
If the data are not already sorted, the constructor sorts them for you at O(N log(N)) cost.
Call operator complexity is O(log(N)).
Works with both integer and floating point types.
If the input data consists of integers, the output of the call operator is a double. Requires C++17.
[$../graphs/empiricial_cumulative_distribution_gauss.svg]
[$../graphs/empiricial_cumulative_distribution_uniform.svg]
[endsect]
[/section:empirical_cdf]

View File

@@ -2,7 +2,7 @@
<svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'> <svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'>
<style>svg { background-color: black; } <style>svg { background-color: black; }
</style> </style>
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) Gaussian distribution for n = 128 samples</text> <text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) distribution on 128 samples</text>
<g transform='translate(25, 40)'> <g transform='translate(25, 40)'>
<line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' /> <line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' />
<line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' /> <line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' />

Before

Width:  |  Height:  |  Size: 29 KiB

After

Width:  |  Height:  |  Size: 29 KiB

View File

@@ -2,7 +2,7 @@
<svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'> <svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'>
<style>svg { background-color: black; } <style>svg { background-color: black; }
</style> </style>
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and theoretical CDF (orange) of an dice roll distribution on n = 128 samples</text> <text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and theoretical CDF (orange) of the dice roll distribution on n = 128 samples</text>
<g transform='translate(25, 40)'> <g transform='translate(25, 40)'>
<line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' /> <line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' />
<line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' /> <line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' />

Before

Width:  |  Height:  |  Size: 29 KiB

After

Width:  |  Height:  |  Size: 29 KiB

View File

@@ -6,6 +6,7 @@
#ifndef BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP #ifndef BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP
#define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP #define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP
#include <algorithm> #include <algorithm>
#include <iterator>
namespace boost { namespace math{ namespace boost { namespace math{
@@ -22,7 +23,8 @@ public:
} }
auto operator()(Real x) const { auto operator()(Real x) const {
if constexpr (std::is_integral_v<Real>) { if constexpr (std::is_integral_v<Real>)
{
if (x < m_v[0]) { if (x < m_v[0]) {
return double(0); return double(0);
} }
@@ -32,15 +34,16 @@ public:
auto it = std::upper_bound(m_v.begin(), m_v.end(), x); auto it = std::upper_bound(m_v.begin(), m_v.end(), x);
return static_cast<double>(std::distance(m_v.begin(), it))/static_cast<double>(m_v.size()); return static_cast<double>(std::distance(m_v.begin(), it))/static_cast<double>(m_v.size());
} }
else { else
if (x < m_v[0]) { {
return Real(0); if (x < m_v[0]) {
} return Real(0);
if (x >= m_v[m_v.size()-1]) { }
return Real(1); if (x >= m_v[m_v.size()-1]) {
} return Real(1);
auto it = std::upper_bound(m_v.begin(), m_v.end(), x); }
return static_cast<Real>(std::distance(m_v.begin(), it))/static_cast<Real>(m_v.size()); auto it = std::upper_bound(m_v.begin(), m_v.end(), x);
return static_cast<Real>(std::distance(m_v.begin(), it))/static_cast<Real>(m_v.size());
} }
} }

View File

@@ -951,6 +951,7 @@ test-suite misc :
[ run compile_test/catmull_rom_concept_test.cpp compile_test_main : : : [ requires cxx11_hdr_array cxx11_hdr_initializer_list ] ] [ run compile_test/catmull_rom_concept_test.cpp compile_test_main : : : [ requires cxx11_hdr_array cxx11_hdr_initializer_list ] ]
[ run ooura_fourier_integral_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run ooura_fourier_integral_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
[ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
[ run empirical_cumulative_distribution_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
[ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
[ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
[ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ] [ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]

View File

@@ -16,18 +16,54 @@
using boost::multiprecision::float128; using boost::multiprecision::float128;
#endif #endif
using boost::math::distributions::empirical_cumulative_distribution_function; using boost::math::empirical_cumulative_distribution_function;
// Checks the empirical CDF built from integer-valued samples of type Z.
// For each probe value x the expected result is
//   (number of samples <= x) / (number of samples),
// compared against ecdf(x) with CHECK_ULP_CLOSE.
// NOTE(review): the trailing `1` is presumably the allowed distance in ULPs;
// confirm against the definition of CHECK_ULP_CLOSE.
template<class Z>
void test_uniform_z()
{
// Six distinct values supplied out of order, so the input is not pre-sorted.
std::vector<Z> v{6,3,4,1,2,5};
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
// With one sample at each of 1..6, the k-th value should map to k/6.
CHECK_ULP_CLOSE(1.0/6.0, ecdf(1), 1);
CHECK_ULP_CLOSE(2.0/6.0, ecdf(2), 1);
CHECK_ULP_CLOSE(3.0/6.0, ecdf(3), 1);
CHECK_ULP_CLOSE(4.0/6.0, ecdf(4), 1);
CHECK_ULP_CLOSE(5.0/6.0, ecdf(5), 1);
CHECK_ULP_CLOSE(6.0/6.0, ecdf(6), 1);
// Less trivial:
// Eight samples with repeats (1 three times, 4 twice) and a gap (no 5).
// v was moved from above; assigning a new list gives it a fresh value.
v = {6,3,4,1,1,1,2,4};
ecdf = empirical_cumulative_distribution_function(std::move(v));
// Repeated values raise the step by their multiplicity: three 1s give 3/8.
CHECK_ULP_CLOSE(3.0/8.0, ecdf(1), 1);
CHECK_ULP_CLOSE(4.0/8.0, ecdf(2), 1);
CHECK_ULP_CLOSE(5.0/8.0, ecdf(3), 1);
CHECK_ULP_CLOSE(7.0/8.0, ecdf(4), 1);
// 5 is absent from the data, so the value stays at the level reached at 4.
CHECK_ULP_CLOSE(7.0/8.0, ecdf(5), 1);
CHECK_ULP_CLOSE(8.0/8.0, ecdf(6), 1);
}
template<class Real> template<class Real>
void test_uniform() void test_uniform()
{ {
size_t n = 128;
std::vector<Real> v(n);
for (size_t i = 0; i < n; ++i) {
v[i] = Real(i+1)/Real(n);
}
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
for (size_t i = 0; i < n; ++i) {
CHECK_ULP_CLOSE(Real(i+1)/Real(n), ecdf(Real(i+1)/Real(n)), 1);
}
} }
int main() int main()
{ {
test_uniform_z<int>();
test_uniform<float>(); test_uniform<double>();
return boost::math::test::report_errors(); return boost::math::test::report_errors();
} }