2
0
mirror of https://github.com/boostorg/math.git synced 2026-01-19 04:22:09 +00:00

Implemented mode and associated tests (#390)

* Implemented mode and associated tests

* Clarity and complexity changes

* Added google benchmark

* Small changes to mode. More tests.

* Seperated into sorted and non-sorted functions

* Fixed data types and removed copying

* Fixed bounds checking

* Additional tests and cleanup.

* Added tests for std::list and std::forward_list

* Small testing changes and documentation

* Added modes memory allocation and faster insertion

* Changed return type, and modified tests

* Removed copied iterator and edge cases.

* Documentation fixes
This commit is contained in:
Matt Borland
2020-07-09 12:28:29 -05:00
committed by GitHub
parent 6bb0e8dee0
commit 699d326f64
4 changed files with 300 additions and 0 deletions

View File

@@ -90,6 +90,17 @@ namespace boost{ namespace math{ namespace statistics {
template<class ForwardIterator>
auto sample_gini_coefficient(ForwardIterator first, ForwardIterator last);
template<class ForwardIterator, class OutputIterator>
auto sorted_mode(ForwardIterator first, ForwardIterator last, OutputIterator output) -> decltype(output)
template<class Container, class OutputIterator>
inline auto sorted_mode(Container & v, OutputIterator output) -> decltype(output)
template<class RandomAccessIterator, class OutputIterator>
auto mode(RandomAccessIterator first, RandomAccessIterator last, OutputIterator output) -> decltype(output)
template<class RandomAccessContainer, class OutputIterator>
inline auto mode(RandomAccessContainer & v, OutputIterator output) -> decltype(output)
}}}
``
@@ -252,6 +263,38 @@ You should have /very/ good cause to pass negative values to the Gini coefficien
Another use case is found in signal processing, but the sorting is by magnitude and hence has a different implementation.
See `absolute_gini_coefficient` for details.
[heading Mode]
Compute the mode(s) of a data set:
std::vector<int> v {1, 3, 2, 2, 5, 4};
std::vector<int> modes;
boost::math::statistics::mode(v, std::back_inserter(modes));
// Mode is 2, modes.size() == 1
std::deque<int> d_modes;
std::array<int, 7> w {2, 2, 3, 1, 5, 4, 4};
boost::math::statistics::mode(w, std::back_inserter(d_modes));
// Modes are 2 and 4, d_modes.size() == 2
/Nota bene/: The input data is altered: in particular, it is sorted. Makes a call to `std::sort`, and as such requires random access iterators.
If your data is sorted, the following function can be used instead:
std::vector<int> v {1, 2, 2, 3, 4, 5};
std::vector<int> modes;
boost::math::statistics::sorted_mode(v, std::back_inserter(modes));
// Mode is 2, modes.size() == 1
std::deque<int> d_modes;
std::array<int, 7> w {1, 2, 2, 3, 4, 4, 5};
boost::math::statistics::sorted_mode(w, std::back_inserter(d_modes));
// Modes are 2 and 4, d_modes.size() == 2
/Nota bene/: The requirements for sorted_mode are reduced to forward iterators because there is no call to `std::sort`.
/Nota bene/: Passing unsorted data to sorted_mode is a bug.
For both mode, and sorted_mode the dataset must be of an integer type.
[heading References]
* Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002.

View File

@@ -512,6 +512,63 @@ inline auto interquartile_range(RandomAccessContainer & v)
return interquartile_range(v.begin(), v.end());
}
template<class ForwardIterator, class OutputIterator>
auto sorted_mode(ForwardIterator first, ForwardIterator last, OutputIterator output) -> decltype(output)
{
using Z = typename std::iterator_traits<ForwardIterator>::value_type;
static_assert(std::is_integral<Z>::value, "Floating point values have not yet been implemented.");
using Size = typename std::iterator_traits<ForwardIterator>::difference_type;
std::vector<Z> modes {};
modes.reserve(16);
Size max_counter {0};
while(first != last)
{
Size current_count {0};
auto end_it {first};
while(end_it != last && *end_it == *first)
{
++current_count;
++end_it;
}
if(current_count > max_counter)
{
modes.resize(1);
modes[0] = *first;
max_counter = current_count;
}
else if(current_count == max_counter)
{
modes.emplace_back(*first);
}
first = end_it;
}
return std::move(modes.begin(), modes.end(), output);
}
template<class Container, class OutputIterator>
inline auto sorted_mode(Container & v, OutputIterator output) -> decltype(output)
{
return sorted_mode(v.begin(), v.end(), output);
}
template<class RandomAccessIterator, class OutputIterator>
auto mode(RandomAccessIterator first, RandomAccessIterator last, OutputIterator output) -> decltype(output)
{
std::sort(first, last);
return sorted_mode(first, last, output);
}
template<class RandomAccessContainer, class OutputIterator>
inline auto mode(RandomAccessContainer & v, OutputIterator output) -> decltype(output)
{
return mode(v.begin(), v.end(), output);
}
}
#endif

View File

@@ -0,0 +1,130 @@
// (C) Copyright Nick Thompson and Matt Borland 2020.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <random>
#include <boost/math/statistics/univariate_statistics.hpp>
#include <benchmark/benchmark.h>
template <class Z>
void test_mode(benchmark::State& state)
{
using boost::math::statistics::sorted_mode;
std::random_device rd;
std::mt19937_64 mt(rd());
std::uniform_int_distribution<> dist {1, 10};
auto gen = [&dist, &mt](){return dist(mt);};
std::vector<Z> v(state.range(0));
std::generate(v.begin(), v.end(), gen);
for (auto _ : state)
{
std::vector<Z> modes;
benchmark::DoNotOptimize(sorted_mode(v.begin(), v.end(), std::back_inserter(modes)));
}
state.SetComplexityN(state.range(0));
}
template <class Z>
void sequential_test_mode(benchmark::State& state)
{
using boost::math::statistics::sorted_mode;
std::vector<Z> v(state.range(0));
size_t current_num {1};
// produces {1, 2, 3, 4, 5...}
for(size_t i {}; i < v.size(); ++i)
{
v[i] = current_num;
++current_num;
}
for (auto _ : state)
{
std::vector<Z> modes;
benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes)));
}
state.SetComplexityN(state.range(0));
}
template <class Z>
void sequential_pairs_test_mode(benchmark::State& state)
{
using boost::math::statistics::sorted_mode;
std::vector<Z> v(state.range(0));
size_t current_num {1};
size_t current_num_counter {};
// produces {1, 1, 2, 2, 3, 3, ...}
for(size_t i {}; i < v.size(); ++i)
{
v[i] = current_num;
++current_num_counter;
if(current_num_counter > 2)
{
++current_num;
current_num_counter = 0;
}
}
for (auto _ : state)
{
std::vector<Z> modes;
benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes)));
}
state.SetComplexityN(state.range(0));
}
template <class Z>
void sequential_multiple_test_mode(benchmark::State& state)
{
using boost::math::statistics::sorted_mode;
std::vector<Z> v(state.range(0));
size_t current_num {1};
size_t current_num_counter {};
// produces {1, 2, 2, 3, 3, 3, 4, 4, 4, 4, ...}
for(size_t i {}; i < v.size(); ++i)
{
v[i] = current_num;
++current_num_counter;
if(current_num_counter > current_num)
{
++current_num;
current_num_counter = 0;
}
}
for (auto _ : state)
{
std::vector<Z> modes;
benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes)));
}
state.SetComplexityN(state.range(0));
}
BENCHMARK_TEMPLATE(test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_pairs_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_pairs_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_pairs_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_multiple_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_multiple_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_TEMPLATE(sequential_multiple_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity();
BENCHMARK_MAIN();

View File

@@ -7,6 +7,7 @@
#include <vector>
#include <array>
#include <list>
#include <forward_list>
#include <algorithm>
#include <random>
@@ -833,6 +834,69 @@ void test_interquartile_range()
BOOST_TEST_EQ(iqr, 6);
}
template<class Z>
void test_mode()
{
std::vector<Z> modes;
std::vector<Z> v {1, 2, 2, 3, 4, 5};
const Z ref = 2;
// Does iterator call work?
boost::math::statistics::mode(v.begin(), v.end(), std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does container call work?
modes.clear();
boost::math::statistics::mode(v, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with part of a vector?
modes.clear();
boost::math::statistics::mode(v.begin(), v.begin() + 3, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with const qualification? Only if pre-sorted
modes.clear();
boost::math::statistics::sorted_mode(v.cbegin(), v.cend(), std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with std::array?
modes.clear();
std::array<Z, 6> u {1, 2, 2, 3, 4, 5};
boost::math::statistics::mode(u, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with a bi-modal distribuition?
modes.clear();
std::vector<Z> w {1, 2, 2, 3, 3, 4, 5};
boost::math::statistics::mode(w.begin(), w.end(), std::back_inserter(modes));
BOOST_TEST_EQ(modes.size(), 2);
// Does it work with an empty vector?
modes.clear();
std::vector<Z> x {};
boost::math::statistics::mode(x, std::back_inserter(modes));
BOOST_TEST_EQ(modes.size(), 0);
// Does it work with a one item vector
modes.clear();
x.push_back(2);
boost::math::statistics::mode(x, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with a doubly linked list
modes.clear();
std::list<Z> dl {1, 2, 2, 3, 4, 5};
boost::math::statistics::sorted_mode(dl, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
// Does it work with a singly linked list
modes.clear();
std::forward_list<Z> fl {1, 2, 2, 3, 4, 5};
boost::math::statistics::sorted_mode(fl, std::back_inserter(modes));
BOOST_TEST_EQ(ref, modes[0]);
}
int main()
{
test_mean<float>();
@@ -902,5 +966,11 @@ int main()
test_interquartile_range<double>();
test_interquartile_range<cpp_bin_float_50>();
test_mode<int>();
test_mode<int32_t>();
test_mode<int64_t>();
test_mode<uint32_t>();
return boost::report_errors();
}