2
0
mirror of https://github.com/boostorg/math.git synced 2026-01-19 04:22:09 +00:00

Empirical cumulative distribution function: Add bool flag for sorted.

This commit is contained in:
NAThompson
2019-09-25 13:05:58 -04:00
parent ee555a0738
commit c76c2af1ce
2 changed files with 95 additions and 7 deletions

View File

@@ -19,9 +19,11 @@ class empirical_cumulative_distribution_function
{
public:
using Real = typename RandomAccessContainer::value_type;
empirical_cumulative_distribution_function(RandomAccessContainer && v);
empirical_cumulative_distribution_function(RandomAccessContainer && v, bool sorted = false);
auto operator()(Real t) const;
RandomAccessContainer&& return_data();
};
}}
@@ -54,10 +56,25 @@ std::cout << "ecdf(0.0) = " << ecdf(0.0) << "\n";
// should print approximately 0.5 . . .
```
The empirical distribution function operates on sorted data.
If the data are not already sorted, the constructor sorts it for you at O(Nlog(N)) cost.
The empirical distribution function requires sorted data.
By default, the constructor sorts it for you at O(Nlog(N)) cost.
Call operator complexity is O(log(N)).
If your data is already sorted, you can specify this and the constructor simply moves your data into the class:
```
std::sort(v.begin(), v.end());
auto ecdf = empirical_cumulative_distribution_function(std::move(v), /* already sorted = */ true);
```
If you want your data back after being done with the object, use
```
v = ecdf.return_data();
```
This operation invalidates `ecdf`; it can no longer be used.
The call operator complexity is O(log(N)), as it requires a call to `std::upper_bound`.
Works with both integer and floating point types.
If the input data consists of integers, the output of the call operator is a double. Requires C++17.
@@ -67,5 +84,70 @@ If the input data consists of integers, the output of the call operator is a dou
[$../graphs/empiricial_cumulative_distribution_uniform.svg]
[heading Performance]
```
------------------------------------------------------
Benchmark Time
------------------------------------------------------
ECDFConstructorSorted<double>/8 4.52 ns
ECDFConstructorSorted<double>/16 5.20 ns
ECDFConstructorSorted<double>/32 5.22 ns
ECDFConstructorSorted<double>/64 7.37 ns
ECDFConstructorSorted<double>/128 7.16 ns
ECDFConstructorSorted<double>/256 8.97 ns
ECDFConstructorSorted<double>/512 8.44 ns
ECDFConstructorSorted<double>/1024 9.07 ns
ECDFConstructorSorted<double>/2048 11.4 ns
ECDFConstructorSorted<double>/4096 12.6 ns
ECDFConstructorSorted<double>/8192 11.4 ns
ECDFConstructorSorted<double>/16384 16.0 ns
ECDFConstructorSorted<double>/32768 17.0 ns
ECDFConstructorSorted<double>/65536 19.5 ns
ECDFConstructorSorted<double>/131072 15.8 ns
ECDFConstructorSorted<double>/262144 17.9 ns
ECDFConstructorSorted<double>/524288 26.7 ns
ECDFConstructorSorted<double>/1048576 29.5 ns
ECDFConstructorSorted<double>/2097152 31.8 ns
ECDFConstructorSorted<double>/4194304 32.8 ns
ECDFConstructorSorted<double>/8388608 35.4 ns
ECDFConstructorSorted<double>/16777216 30.4 ns
ECDFConstructorSorted<double>_BigO 1.27 lgN
ECDFConstructorSorted<double>_RMS 20 %
ECDFConstructorUnsorted<double>/8 155 ns
ECDFConstructorUnsorted<double>/64 2095 ns
ECDFConstructorUnsorted<double>/512 22212 ns
ECDFConstructorUnsorted<double>/4096 220821 ns
ECDFConstructorUnsorted<double>/32768 1996380 ns
ECDFConstructorUnsorted<double>/262144 18916039 ns
ECDFConstructorUnsorted<double>/2097152 194250013 ns
ECDFConstructorUnsorted<double>/16777216 2281469424 ns
ECDFConstructorUnsorted<double>_BigO 5.65 NlgN
ECDFConstructorUnsorted<double>_RMS 6 %
Shuffle<double>/8 82.4 ns
Shuffle<double>/64 731 ns
Shuffle<double>/512 5876 ns
Shuffle<double>/4096 46864 ns
Shuffle<double>/32768 385265 ns
Shuffle<double>/262144 4663866 ns
Shuffle<double>/2097152 54686332 ns
Shuffle<double>/16777216 875309099 ns
Shuffle<double>_BigO 2.16 NlgN
Shuffle<double>_RMS 12 %
ECDFEvaluation<double>/8 48.6 ns
ECDFEvaluation<double>/64 61.3 ns
ECDFEvaluation<double>/512 85.1 ns
ECDFEvaluation<double>/4096 105 ns
ECDFEvaluation<double>/32768 131 ns
ECDFEvaluation<double>/262144 196 ns
ECDFEvaluation<double>/2097152 391 ns
ECDFEvaluation<double>/16777216 715 ns
ECDFEvaluation<double>_BigO 18.19 lgN
ECDFEvaluation<double>_RMS 60 %
```
The call to the unsorted constructor is in fact a little faster than indicated, as the data must be shuffled after being sorted in the benchmark.
This is itself a fairly expensive operation.
[endsect]
[/section:empirical_cdf]

View File

@@ -7,6 +7,7 @@
#define BOOST_MATH_DISTRIBUTIONS_EMPIRICAL_CUMULATIVE_DISTRIBUTION_FUNCTION_HPP
#include <algorithm>
#include <iterator>
#include <stdexcept>
namespace boost { namespace math{
@@ -14,10 +15,13 @@ template<class RandomAccessContainer>
class empirical_cumulative_distribution_function {
using Real = typename RandomAccessContainer::value_type;
public:
empirical_cumulative_distribution_function(RandomAccessContainer && v)
empirical_cumulative_distribution_function(RandomAccessContainer && v, bool sorted = false)
{
if (v.size() == 0) {
throw std::domain_error("At least one sample is required to compute an empirical CDF.");
}
m_v = std::move(v);
if (!std::is_sorted(m_v.begin(), m_v.end())) {
if (!sorted) {
std::sort(m_v.begin(), m_v.end());
}
}
@@ -47,7 +51,9 @@ public:
}
}
RandomAccessContainer&& return_data() {
return std::move(m_v);
}
private:
RandomAccessContainer m_v;