diff --git a/doc/statistics/univariate_statistics.qbk b/doc/statistics/univariate_statistics.qbk index 0fc1da4e4..d50fb75ad 100644 --- a/doc/statistics/univariate_statistics.qbk +++ b/doc/statistics/univariate_statistics.qbk @@ -90,6 +90,17 @@ namespace boost{ namespace math{ namespace statistics { template auto sample_gini_coefficient(ForwardIterator first, ForwardIterator last); + template + auto sorted_mode(ForwardIterator first, ForwardIterator last, OutputIterator output) -> decltype(output) + + template + inline auto sorted_mode(Container & v, OutputIterator output) -> decltype(output) + + template + auto mode(RandomAccessIterator first, RandomAccessIterator last, OutputIterator output) -> decltype(output) + + template + inline auto mode(RandomAccessContainer & v, OutputIterator output) -> decltype(output) }}} `` @@ -252,6 +263,38 @@ You should have /very/ good cause to pass negative values to the Gini coefficien Another use case is found in signal processing, but the sorting is by magnitude and hence has a different implementation. See `absolute_gini_coefficient` for details. +[heading Mode] + +Compute the mode(s) of a data set: + + std::vector v {1, 3, 2, 2, 5, 4}; + std::vector modes; + boost::math::statistics::mode(v, std::back_inserter(modes)); + // Mode is 2, modes.size() == 1 + std::deque d_modes; + std::array w {2, 2, 3, 1, 5, 4, 4}; + boost::math::statistics::mode(w, std::back_inserter(d_modes)); + // Modes are 2 and 4, d_modes.size() == 2 + +/Nota bene/: The input data is altered: in particular, it is sorted. Makes a call to `std::sort`, and as such requires random access iterators. + +If your data is sorted, the following function can be used instead: + + std::vector v {1, 2, 2, 3, 4, 5}; + std::vector modes; + boost::math::statistics::sorted_mode(v, std::back_inserter(modes)); + // Mode is 2, modes.size() == 1 + std::deque d_modes; + std::array w {1, 2, 2, 3, 4, 4, 5}; + boost::math::statistics::sorted_mode(w, std::back_inserter(d_modes)); + // Modes are 2 and 4, d_modes.size() == 2 + +/Nota bene/: The requirements for sorted_mode are reduced to forward iterators because there is no call to `std::sort`. + +/Nota bene/: Passing unsorted data to sorted_mode is a bug. + +For both mode, and sorted_mode the dataset must be of an integer type. + [heading References] * Higham, Nicholas J. ['Accuracy and stability of numerical algorithms.] Vol. 80. Siam, 2002. diff --git a/include/boost/math/statistics/univariate_statistics.hpp b/include/boost/math/statistics/univariate_statistics.hpp index 9b98d33a9..d69fb8509 100644 --- a/include/boost/math/statistics/univariate_statistics.hpp +++ b/include/boost/math/statistics/univariate_statistics.hpp @@ -512,6 +512,63 @@ inline auto interquartile_range(RandomAccessContainer & v) return interquartile_range(v.begin(), v.end()); } +template +auto sorted_mode(ForwardIterator first, ForwardIterator last, OutputIterator output) -> decltype(output) +{ + using Z = typename std::iterator_traits::value_type; + static_assert(std::is_integral::value, "Floating point values have not yet been implemented."); + using Size = typename std::iterator_traits::difference_type; + + std::vector modes {}; + modes.reserve(16); + Size max_counter {0}; + + while(first != last) + { + Size current_count {0}; + auto end_it {first}; + while(end_it != last && *end_it == *first) + { + ++current_count; + ++end_it; + } + + if(current_count > max_counter) + { + modes.resize(1); + modes[0] = *first; + max_counter = current_count; + } + + else if(current_count == max_counter) + { + modes.emplace_back(*first); + } + + first = end_it; + } + + return std::move(modes.begin(), modes.end(), output); +} + +template +inline auto sorted_mode(Container & v, OutputIterator output) -> decltype(output) +{ + return sorted_mode(v.begin(), v.end(), output); +} + +template +auto mode(RandomAccessIterator first, RandomAccessIterator last, OutputIterator output) -> decltype(output) +{ + std::sort(first, last); + return sorted_mode(first, last, output); +} + +template +inline auto mode(RandomAccessContainer & v, OutputIterator output) -> decltype(output) +{ + return mode(v.begin(), v.end(), output); +} } #endif diff --git a/reporting/performance/test_mode.cpp b/reporting/performance/test_mode.cpp new file mode 100644 index 000000000..1d13ed82d --- /dev/null +++ b/reporting/performance/test_mode.cpp @@ -0,0 +1,130 @@ +// (C) Copyright Nick Thompson and Matt Borland 2020. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include + +template +void test_mode(benchmark::State& state) +{ + using boost::math::statistics::sorted_mode; + + std::random_device rd; + std::mt19937_64 mt(rd()); + std::uniform_int_distribution<> dist {1, 10}; + + auto gen = [&dist, &mt](){return dist(mt);}; + + std::vector v(state.range(0)); + std::generate(v.begin(), v.end(), gen); + + for (auto _ : state) + { + std::vector modes; + benchmark::DoNotOptimize(sorted_mode(v.begin(), v.end(), std::back_inserter(modes))); + } + + state.SetComplexityN(state.range(0)); +} + +template +void sequential_test_mode(benchmark::State& state) +{ + using boost::math::statistics::sorted_mode; + + std::vector v(state.range(0)); + + size_t current_num {1}; + // produces {1, 2, 3, 4, 5...} + for(size_t i {}; i < v.size(); ++i) + { + v[i] = current_num; + ++current_num; + } + + for (auto _ : state) + { + std::vector modes; + benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes))); + } + + state.SetComplexityN(state.range(0)); +} + +template +void sequential_pairs_test_mode(benchmark::State& state) +{ + using boost::math::statistics::sorted_mode; + + std::vector v(state.range(0)); + + size_t current_num {1}; + size_t current_num_counter {}; + // produces {1, 1, 2, 2, 3, 3, ...} + for(size_t i {}; i < v.size(); ++i) + { + v[i] = current_num; + ++current_num_counter; + if(current_num_counter > 2) + { + ++current_num; + current_num_counter = 0; + } + } + + for (auto _ : state) + { + std::vector modes; + benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes))); + } + + state.SetComplexityN(state.range(0)); +} + +template +void sequential_multiple_test_mode(benchmark::State& state) +{ + using boost::math::statistics::sorted_mode; + + std::vector v(state.range(0)); + + size_t current_num {1}; + size_t current_num_counter {}; + // produces {1, 2, 2, 3, 3, 3, 4, 4, 4, 4, ...} + for(size_t i {}; i < v.size(); ++i) + { + v[i] = current_num; + ++current_num_counter; + if(current_num_counter > current_num) + { + ++current_num; + current_num_counter = 0; + } + } + + for (auto _ : state) + { + std::vector modes; + benchmark::DoNotOptimize(sorted_mode(v, std::back_inserter(modes))); + } + + state.SetComplexityN(state.range(0)); +} + +BENCHMARK_TEMPLATE(test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_pairs_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_pairs_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_pairs_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_multiple_test_mode, int32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_multiple_test_mode, int64_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); +BENCHMARK_TEMPLATE(sequential_multiple_test_mode, uint32_t)->RangeMultiplier(2)->Range(1<<1, 1<<22)->Complexity(); + +BENCHMARK_MAIN(); diff --git a/test/univariate_statistics_test.cpp b/test/univariate_statistics_test.cpp index 11598c7ff..6cebfa163 100644 --- a/test/univariate_statistics_test.cpp +++ b/test/univariate_statistics_test.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -833,6 +834,69 @@ void test_interquartile_range() BOOST_TEST_EQ(iqr, 6); } +template +void test_mode() +{ + std::vector modes; + std::vector v {1, 2, 2, 3, 4, 5}; + const Z ref = 2; + + // Does iterator call work? + boost::math::statistics::mode(v.begin(), v.end(), std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does container call work? + modes.clear(); + boost::math::statistics::mode(v, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with part of a vector? + modes.clear(); + boost::math::statistics::mode(v.begin(), v.begin() + 3, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with const qualification? Only if pre-sorted + modes.clear(); + boost::math::statistics::sorted_mode(v.cbegin(), v.cend(), std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with std::array? + modes.clear(); + std::array u {1, 2, 2, 3, 4, 5}; + boost::math::statistics::mode(u, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with a bi-modal distribuition? + modes.clear(); + std::vector w {1, 2, 2, 3, 3, 4, 5}; + boost::math::statistics::mode(w.begin(), w.end(), std::back_inserter(modes)); + BOOST_TEST_EQ(modes.size(), 2); + + // Does it work with an empty vector? + modes.clear(); + std::vector x {}; + boost::math::statistics::mode(x, std::back_inserter(modes)); + BOOST_TEST_EQ(modes.size(), 0); + + // Does it work with a one item vector + modes.clear(); + x.push_back(2); + boost::math::statistics::mode(x, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with a doubly linked list + modes.clear(); + std::list dl {1, 2, 2, 3, 4, 5}; + boost::math::statistics::sorted_mode(dl, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); + + // Does it work with a singly linked list + modes.clear(); + std::forward_list fl {1, 2, 2, 3, 4, 5}; + boost::math::statistics::sorted_mode(fl, std::back_inserter(modes)); + BOOST_TEST_EQ(ref, modes[0]); +} + int main() { test_mean(); @@ -902,5 +966,11 @@ int main() test_interquartile_range(); test_interquartile_range(); + + test_mode(); + test_mode(); + test_mode(); + test_mode(); + return boost::report_errors(); }