mirror of
https://github.com/boostorg/math.git
synced 2026-01-19 04:22:09 +00:00
Empirical Cumulative Distribution function
This commit is contained in:
@@ -37,6 +37,7 @@
|
||||
[include triangular.qbk]
|
||||
[include uniform.qbk]
|
||||
[include weibull.qbk]
|
||||
[include empirical_cdf.qbk]
|
||||
|
||||
[endsect] [/section:dists Distributions]
|
||||
|
||||
@@ -138,10 +139,3 @@ opportunity to integrate the statistical tests with this framework at some later
|
||||
(See accompanying file LICENSE_1_0.txt or copy at
|
||||
http://www.boost.org/LICENSE_1_0.txt).
|
||||
]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
71
doc/distributions/empirical_cdf.qbk
Normal file
71
doc/distributions/empirical_cdf.qbk
Normal file
@@ -0,0 +1,71 @@
|
||||
[/
|
||||
Copyright (c) 2019 Nick Thompson
|
||||
Use, modification and distribution are subject to the
|
||||
Boost Software License, Version 1.0. (See accompanying file
|
||||
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
]
|
||||
|
||||
[section:empirical_cdf Empirical Cumulative Distribution Function]
|
||||
|
||||
[heading Synopsis]
|
||||
|
||||
```
|
||||
#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
|
||||
|
||||
namespace boost{ namespace math{
|
||||
|
||||
template <class RandomAccessContainer>
|
||||
class empirical_cumulative_distribution_function
|
||||
{
|
||||
public:
|
||||
using Real = typename RandomAccessContainer::value_type;
|
||||
empirical_cumulative_distribution_function(RandomAccessContainer && v);
|
||||
|
||||
auto operator()(Real t) const;
|
||||
};
|
||||
|
||||
}}
|
||||
```
|
||||
|
||||
[heading Empirical Cumulative Distribution Function]
|
||||
|
||||
The empirical cumulative distribution function is a step function constructed from observed data, which converges to the true cumulative distribution function in the limit of infinite data.
|
||||
This function is a basic building block of hypothesis testing workflows that attempt to answer the question "does my data come from a given distribution?"
|
||||
These tests require computing quadratures over some function of the empirical CDF and the supposed CDF to create a distance measurement, and hence it is occasionally useful to construct a continuous callable from the data.
|
||||
|
||||
An example usage is demonstrated below:
|
||||
|
||||
```
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <boost/math/distributions/empirical_cumulative_distribution_function.hpp>
|
||||
using boost::math::empirical_cumulative_distribution_function;
|
||||
std::random_device rd;
|
||||
std::mt19937 gen{rd()};
|
||||
std::normal_distribution<double> dis(0, 1);
|
||||
size_t n = 128;
|
||||
std::vector<double> v(n);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
v[i] = dis(gen);
|
||||
}
|
||||
|
||||
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
|
||||
std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n";
|
||||
// should print approximately 0.5 . . .
|
||||
```
|
||||
|
||||
The empirical distribution function operates on sorted data.
|
||||
If the data are not already sorted, the constructor sorts them for you at O(N log(N)) cost.
|
||||
|
||||
Call operator complexity is O(log(N)).
|
||||
|
||||
Works with both integer and floating point types.
|
||||
If the input data consist of integers, the call operator returns a double. This class requires C++17.
|
||||
|
||||
[$../graphs/empiricial_cumulative_distribution_gauss.svg]
|
||||
|
||||
[$../graphs/empiricial_cumulative_distribution_uniform.svg]
|
||||
|
||||
|
||||
[endsect]
|
||||
[/section:empirical_cdf]
|
||||
@@ -2,7 +2,7 @@
|
||||
<svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'>
|
||||
<style>svg { background-color: black; }
|
||||
</style>
|
||||
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) Gaussian distribution for n = 128 samples</text>
|
||||
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and continuous CDF (orange) of an 𝓝(0,1) distribution on 128 samples</text>
|
||||
<g transform='translate(25, 40)'>
|
||||
<line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' />
|
||||
<line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' />
|
||||
|
||||
|
Before Width: | Height: | Size: 29 KiB After Width: | Height: | Size: 29 KiB |
@@ -2,7 +2,7 @@
|
||||
<svg xmlns='http://www.w3.org/2000/svg' width='1100' height='679'>
|
||||
<style>svg { background-color: black; }
|
||||
</style>
|
||||
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and theoretical CDF (orange) of an dice roll distribution on n = 128 samples</text>
|
||||
<text x='550' y='20' font-family='Palatino' font-size='25' fill='white' alignment-baseline='middle' text-anchor='middle'>Empirical (blue) and theoretical CDF (orange) of the dice roll distribution on n = 128 samples</text>
|
||||
<g transform='translate(25, 40)'>
|
||||
<line x1='0' y1='0' x2='0' y2='619' stroke='gray' stroke-width='1' />
|
||||
<line x1='0' y1='619' x2='1055' y2='619' stroke='gray' stroke-width='1' />
|
||||
|
||||
|
Before Width: | Height: | Size: 29 KiB After Width: | Height: | Size: 29 KiB |
@@ -6,6 +6,7 @@
|
||||
#ifndef BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP
|
||||
#define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
||||
namespace boost { namespace math{
|
||||
|
||||
@@ -22,7 +23,8 @@ public:
|
||||
}
|
||||
|
||||
auto operator()(Real x) const {
|
||||
if constexpr (std::is_integral_v<Real>) {
|
||||
if constexpr (std::is_integral_v<Real>)
|
||||
{
|
||||
if (x < m_v[0]) {
|
||||
return double(0);
|
||||
}
|
||||
@@ -32,15 +34,16 @@ public:
|
||||
auto it = std::upper_bound(m_v.begin(), m_v.end(), x);
|
||||
return static_cast<double>(std::distance(m_v.begin(), it))/static_cast<double>(m_v.size());
|
||||
}
|
||||
else {
|
||||
if (x < m_v[0]) {
|
||||
return Real(0);
|
||||
}
|
||||
if (x >= m_v[m_v.size()-1]) {
|
||||
return Real(1);
|
||||
}
|
||||
auto it = std::upper_bound(m_v.begin(), m_v.end(), x);
|
||||
return static_cast<Real>(std::distance(m_v.begin(), it))/static_cast<Real>(m_v.size());
|
||||
else
|
||||
{
|
||||
if (x < m_v[0]) {
|
||||
return Real(0);
|
||||
}
|
||||
if (x >= m_v[m_v.size()-1]) {
|
||||
return Real(1);
|
||||
}
|
||||
auto it = std::upper_bound(m_v.begin(), m_v.end(), x);
|
||||
return static_cast<Real>(std::distance(m_v.begin(), it))/static_cast<Real>(m_v.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -951,6 +951,7 @@ test-suite misc :
|
||||
[ run compile_test/catmull_rom_concept_test.cpp compile_test_main : : : [ requires cxx11_hdr_array cxx11_hdr_initializer_list ] ]
|
||||
[ run ooura_fourier_integral_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
[ run univariate_statistics_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
[ run empirical_cumulative_distribution_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
[ run norms_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
[ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
[ run bivariate_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
|
||||
|
||||
@@ -16,18 +16,54 @@
|
||||
using boost::multiprecision::float128;
|
||||
#endif
|
||||
|
||||
using boost::math::distributions::empirical_cumulative_distribution_function;
|
||||
using boost::math::empirical_cumulative_distribution_function;
|
||||
|
||||
// Checks the empirical CDF on small integer-valued samples of type Z.
// The input vectors are deliberately unsorted; the expected values below are
// the fraction of samples less than or equal to the query point.
// NOTE(review): CHECK_ULP_CLOSE is a project test macro defined elsewhere;
// presumably it compares (expected, computed) within the given number of ULPs.
template<class Z>
|
||||
void test_uniform_z()
|
||||
{
|
||||
// Six distinct values 1..6 in scrambled order.
std::vector<Z> v{6,3,4,1,2,5};
|
||||
|
||||
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
|
||||
|
||||
// With one sample at each of 1..6, ecdf(k) is expected to be k/6.
CHECK_ULP_CLOSE(1.0/6.0, ecdf(1), 1);
|
||||
CHECK_ULP_CLOSE(2.0/6.0, ecdf(2), 1);
|
||||
CHECK_ULP_CLOSE(3.0/6.0, ecdf(3), 1);
|
||||
CHECK_ULP_CLOSE(4.0/6.0, ecdf(4), 1);
|
||||
CHECK_ULP_CLOSE(5.0/6.0, ecdf(5), 1);
|
||||
CHECK_ULP_CLOSE(6.0/6.0, ecdf(6), 1);
|
||||
|
||||
// Less trivial:
// eight samples with repeated values (three 1s, two 4s) and no 5.
|
||||
|
||||
// v was moved from above; assigning a new value makes it usable again.
v = {6,3,4,1,1,1,2,4};
|
||||
ecdf = empirical_cumulative_distribution_function(std::move(v));
|
||||
// Three of the eight samples are <= 1.
CHECK_ULP_CLOSE(3.0/8.0, ecdf(1), 1);
|
||||
CHECK_ULP_CLOSE(4.0/8.0, ecdf(2), 1);
|
||||
CHECK_ULP_CLOSE(5.0/8.0, ecdf(3), 1);
|
||||
// Both 4s are counted, so the step at 4 has height 2/8.
CHECK_ULP_CLOSE(7.0/8.0, ecdf(4), 1);
|
||||
// 5 does not occur in the data, so the expected value is the same as at 4.
CHECK_ULP_CLOSE(7.0/8.0, ecdf(5), 1);
|
||||
CHECK_ULP_CLOSE(8.0/8.0, ecdf(6), 1);
|
||||
}
|
||||
|
||||
// Checks the empirical CDF on n = 128 evenly spaced samples 1/n, 2/n, ..., 1
// of floating-point type Real: evaluating at the (i+1)-th sample is expected
// to give (i+1)/n.
// NOTE(review): CHECK_ULP_CLOSE is a project test macro defined elsewhere;
// presumably it compares (expected, computed) within the given number of ULPs.
template<class Real>
|
||||
void test_uniform()
|
||||
{
|
||||
// n is a power of two, so each quotient (i+1)/n below is exactly
// representable in binary floating point.
size_t n = 128;
|
||||
std::vector<Real> v(n);
|
||||
// Fill with the already-sorted grid (i+1)/n for i = 0..n-1.
for (size_t i = 0; i < n; ++i) {
|
||||
v[i] = Real(i+1)/Real(n);
|
||||
}
|
||||
|
||||
auto ecdf = empirical_cumulative_distribution_function(std::move(v));
|
||||
|
||||
// At each sample point, exactly i+1 of the n samples are <= the query.
for (size_t i = 0; i < n; ++i) {
|
||||
CHECK_ULP_CLOSE(Real(i+1)/Real(n), ecdf(Real(i+1)/Real(n)), 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Test driver: runs the floating-point test for float and double and the
// integer test for int, then returns the result of report_errors().
int main()
|
||||
{
|
||||
|
||||
test_uniform<float>();
|
||||
test_uniform_z<int>();
|
||||
test_uniform<double>();
|
||||
// NOTE(review): report_errors() is a project test helper defined elsewhere;
// presumably it yields a nonzero exit status if any check failed - confirm.
return boost::math::test::report_errors();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user