diff --git a/doc/vector_functionals/signal_statistics.qbk b/doc/vector_functionals/signal_statistics.qbk index 15ae154ab..c1a913d69 100644 --- a/doc/vector_functionals/signal_statistics.qbk +++ b/doc/vector_functionals/signal_statistics.qbk @@ -46,10 +46,10 @@ namespace boost{ namespace math{ namespace tools { auto shannon_cost(ForwardIterator first, ForwardIterator last); template - auto oracle_snr(Container const & signal, Container const & noise); + auto oracle_snr(Container const & signal, Container const & noisy_signal); template - auto oracle_snr_db(Container const & signal, Container const & noise); + auto oracle_snr_db(Container const & signal, Container const & noisy_signal); template auto m2m4_snr_estimator(Container const & noisy_signal, typename Container::value_type estimated_signal_kurtosis=1, typename Container::value_type estimate_noise_kurtosis=3); @@ -73,41 +73,53 @@ For certain operations (total variation, for example) integer inputs are support The absolute median is used in signal processing, where the median of the magnitude of the coefficients in some expansion are used to estimate noise variance. See [@https://wavelet-tour.github.io/ Mallat] for details. -The absolute median supports both real and complex arithmetic, modifies its input, and requires random access iterators. +The absolute median supports both real and complex arithmetic, modifies its input, and requires random access containers. std::vector v{-1, 1}; - double m = boost::math::tools::absolute_median(v.begin(), v.end()); + double m = boost::math::tools::absolute_median(v); // m = 1 + // Alternative syntax, using a subset of the container: + m = boost::math::tools::absolute_median(v.begin(), v.begin() + 1); [heading Absolute Gini Coefficient] The Gini coefficient, first used to measure wealth inequality, is also one of the best measures of the sparsity of an expansion in a basis. A sparse expansion has most of its norm concentrated in just a few coefficients, making the connection with wealth inequality obvious. +ee [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard] for details. However, for measuring sparsity, the phase of the numbers is irrelevant, so we provide the `absolute_gini_coefficient`: + using boost::math::tools::absolute_gini_coefficient; std::vector> v{{0,1}, {0,0}, {0,0}, {0,0}}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); - // now abs_gini = 1 + double abs_gini = absolute_gini_coefficient(v); + // now abs_gini = 1; maximally unequal std::vector> w{{0,1}, {1,0}, {0,-1}, {-1,0}}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(w.begin(), w.end()); - // now abs_gini = 0 + abs_gini = absolute_gini_coefficient(w); + // now abs_gini = 0; every element of the vector has equal magnitude std::vector u{-1, 1, -1}; - double abs_gini = boost::math::tools::absolute_gini_coefficient(u.begin(), u.end()); + abs_gini = absolute_gini_coefficient(u); // now abs_gini = 0 + // Alternative call useful for computing over subset of the input: + abs_gini = absolute_gini_coefficient(u.begin(), u.begin() + 1); + + // If you need the population Gini coefficient: + double population_gini = (u.size() -1)*absolute_gini_coefficient(u)/u.size(); Wikipedia calls our scaling a "sample Gini coefficient". We chose this scaling because it always returns unity for a vector which has only one nonzero coefficient, whereas the value of the population Gini coefficient of a vector with one non-zero element is dependent on the length of the input. +Our scaling lacks one desirable property of the population Gini coefficient, namely that "cloning" a vector has the same Gini coefficient. +If you wish to recover the cloning property, convert to the population Gini coefficient. + If sorting the input data is too much expense for a sparsity measure (is it going to be perfect anyway?), consider calculating the Hoyer sparsity instead. [heading Hoyer Sparsity] The Hoyer sparsity measures a normalized ratio of the \u2113[super 1] and \u2113[super 2] norms. -As the name suggests, it is used to measure sparsity in an expansion in some basis. +As the name suggests, it is used to measure the sparsity of an expansion in some basis. The Hoyer sparsity computes ([radic]/N/ - \u2113[super 1](v)/\u2113[super 2](v))/([radic]N -1). For details, see [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard]. @@ -145,20 +157,18 @@ See [@https://doi.org/10.1007/978-3-642-56702-5 Ripples in Mathematics] for deta [heading Oracle Signal-to-noise ratio] -The function `oracle_snr` computes the ratio \u2016 /s/ \u2016[sub 2][super 2] / \u2016 /w/ \u2016[sub 2][super 2], where /s/ is signal and /w/ is noise. -The function `oracle_snr_db` computes 10`log`[sub 10](\u2016 /s/ \u2016[super 2] / \u2016 /w/ \u2016[super 2]). -In general, one does not know how to decompose a real signal /x/ into /s/ + /w/ and as such /s/ is regarded as oracle information. -Hence this function is mainly useful for unit testing other SNR measurements. +The function `oracle_snr` computes the ratio \u2016 /s/ \u2016[sub 2][super 2] / \u2016 /s/ - /x/ \u2016[sub 2][super 2], where /s/ is signal and /x/ is a noisy signal. +The function `oracle_snr_db` computes 10`log`[sub 10](\u2016 /s/ \u2016[super 2] / \u2016 /s/ - /x/ \u2016[super 2]). +The functions are so named because in general, one does not know how to decompose a real signal /x/ into /s/ + /w/ and as such /s/ is regarded as oracle information. +Hence this function is mainly useful for unit testing other SNR estimators. Usage: std::vector signal(500, 3.2); - std::vector noise(500); - // fill 'noise' with Gaussian white noise... - double snr_db = boost::math::tools::oracle_snr_db(signal, noise); - double snr = boost::math::tools::oracle_snr(signal, noise); - -The call should return the same value as [@https://www.mathworks.com/help/signal/ref/snr.html Matlab's `snr`]. + std::vector noisy_signal(500); + // fill 'noisy_signal' signal + noise + double snr_db = boost::math::tools::oracle_snr_db(signal, noisy_signal); + double snr = boost::math::tools::oracle_snr(signal, noisy_signal); The input can be real, complex, or integral. Integral inputs produce double precision floating point outputs. diff --git a/doc/vector_functionals/univariate_statistics.qbk b/doc/vector_functionals/univariate_statistics.qbk index d4d923f5e..0cd5086ba 100644 --- a/doc/vector_functionals/univariate_statistics.qbk +++ b/doc/vector_functionals/univariate_statistics.qbk @@ -149,13 +149,15 @@ Compute the Gini coefficient of a dataset: /Nota bene: The input data is altered-in particular, it is sorted./ /Nota bene:/ Different authors use different conventions regarding the overall scale of the Gini coefficient. -We have chosen to follow [@https://arxiv.org/pdf/0811.4706.pdf Hurley and Rickard's definition], which [@https://en.wikipedia.org/wiki/Gini_coefficient Wikipedia] calls a "sample Gini coefficient". -Hurley and Rickard's definition places the Gini coefficient in the range [0,1]; Wikipedia's population Gini coefficient is in the range [0, 1 - 1/ /n/]. -If you wish to convert the Boost Gini coefficient to the population Gini coefficient, multiply by (/n/-1)/ /n/. +We use [@https://en.wikipedia.org/wiki/Gini_coefficient Wikipedia's] "sample Gini coefficient". +The sample Gini coefficient lies in the range [0,1], whereas the population Gini coefficient is in the range [0, 1 - 1/ /n/]. +If you wish to convert the sample Gini coefficient returned by Boost to the population Gini coefficient, multiply by (/n/-1)/ /n/. /Nota bene:/ There is essentially no reason to pass negative values to the Gini coefficient function. However, a single use case (measuring wealth inequality when some people have negative wealth) exists, so we do not throw an exception when negative values are encountered. You should have /very/ good cause to pass negative values to the Gini coefficient calculator. +Another use case is found in signal processing, but the sorting is by magnitude and hence has a different implementation. +See `absolute_gini_coefficient` for details. [heading References] diff --git a/include/boost/math/tools/signal_statistics.hpp b/include/boost/math/tools/signal_statistics.hpp index 8e7c0c880..a82939469 100644 --- a/include/boost/math/tools/signal_statistics.hpp +++ b/include/boost/math/tools/signal_statistics.hpp @@ -96,6 +96,7 @@ inline auto shannon_cost(Container const & v) template auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last) { + using std::abs; using RealOrComplex = typename std::iterator_traits::value_type; BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples."); @@ -120,6 +121,8 @@ auto absolute_gini_coefficient(ForwardIterator first, ForwardIterator last) return zero; } return ((2*num)/denom - i)/(i-2); + + } template @@ -168,10 +171,11 @@ inline auto hoyer_sparsity(Container const & v) template -auto oracle_snr(Container const & signal, Container const & noise) +auto oracle_snr(Container const & signal, Container const & noisy_signal) { using Real = typename Container::value_type; - BOOST_ASSERT_MSG(signal.size() == noise.size(), "Signal and noise must be have the same number of elements."); + BOOST_ASSERT_MSG(signal.size() == noisy_signal.size(), + "Signal and noisy_signal must be have the same number of elements."); if constexpr (std::is_integral::value) { double numerator = 0; @@ -179,7 +183,7 @@ auto oracle_snr(Container const & signal, Container const & noise) for (size_t i = 0; i < signal.size(); ++i) { numerator += signal[i]*signal[i]; - denominator += noise[i]*noise[i]; + denominator += (noisy_signal[i] - signal[i])*(noisy_signal[i] - signal[i]); } if (numerator == 0 && denominator == 0) { @@ -201,7 +205,7 @@ auto oracle_snr(Container const & signal, Container const & noise) for (size_t i = 0; i < signal.size(); ++i) { numerator += norm(signal[i]); - denominator += norm(noise[i]); + denominator += norm(noisy_signal[i] - signal[i]); } if (numerator == 0 && denominator == 0) { @@ -221,7 +225,7 @@ auto oracle_snr(Container const & signal, Container const & noise) for (size_t i = 0; i < signal.size(); ++i) { numerator += signal[i]*signal[i]; - denominator += noise[i]*noise[i]; + denominator += (signal[i] - noisy_signal[i])*(signal[i] - noisy_signal[i]); } if (numerator == 0 && denominator == 0) { @@ -237,10 +241,10 @@ auto oracle_snr(Container const & signal, Container const & noise) } template -auto mean_invariant_oracle_snr(Container const & signal, Container const & noise) +auto mean_invariant_oracle_snr(Container const & signal, Container const & noisy_signal) { using Real = typename Container::value_type; - BOOST_ASSERT_MSG(signal.size() == noise.size(), "Signal and noise must be have the same number of elements."); + BOOST_ASSERT_MSG(signal.size() == noisy_signal.size(), "Signal and noise must be have the same number of elements."); Real mean = boost::math::tools::mean(signal); Real numerator = 0; @@ -249,7 +253,7 @@ auto mean_invariant_oracle_snr(Container const & signal, Container const & noise { Real tmp = signal[i] - mean; numerator += tmp*tmp; - denominator += noise[i]*noise[i]; + denominator += (signal[i] - noisy_signal[i])*(signal[i] - noisy_signal[i]); } if (numerator == 0 && denominator == 0) { @@ -264,21 +268,20 @@ auto mean_invariant_oracle_snr(Container const & signal, Container const & noise } -// Follows the definition of SNR given in Mallat, A Wavelet Tour of Signal Processing, equation 11.16. template -auto mean_invariant_oracle_snr_db(Container const & signal, Container const & noise) +auto mean_invariant_oracle_snr_db(Container const & signal, Container const & noisy_signal) { using std::log10; - return 10*log10(mean_invariant_oracle_snr(signal, noise)); + return 10*log10(mean_invariant_oracle_snr(signal, noisy_signal)); } // Follows the definition of SNR given in Mallat, A Wavelet Tour of Signal Processing, equation 11.16. template -auto oracle_snr_db(Container const & signal, Container const & noise) +auto oracle_snr_db(Container const & signal, Container const & noisy_signal) { using std::log10; - return 10*log10(oracle_snr(signal, noise)); + return 10*log10(oracle_snr(signal, noisy_signal)); } // A good reference on the M2M4 estimator: @@ -300,7 +303,6 @@ auto m2m4_snr_estimator(Container const & noisy_signal, typename Container::val // (ka+kw-6)S^2 + 2M2(3-kw)S + kw*M2^2 - M4 = 0 =: a*S^2 + bs*N + cs = 0 // If we first eliminate S, we obtain the quadratic equation: // (ka+kw-6)N^2 + 2M2(3-ka)N + ka*M2^2 - M4 = 0 =: a*N^2 + bn*N + cn = 0 - // We see that if kw=3, we have a special case, and if ka+kw=6, we have a special case. // I believe these equations are totally independent quadratics; // if one has a complex solution it is not necessarily the case that the other must also. // However, I can't prove that, so there is a chance that this does unnecessary work. diff --git a/test/signal_statistics_test.cpp b/test/signal_statistics_test.cpp index 83ea617aa..f22f6a9c7 100644 --- a/test/signal_statistics_test.cpp +++ b/test/signal_statistics_test.cpp @@ -29,17 +29,18 @@ using boost::math::constants::two_pi; * 3) Does it work with ublas and std::array? (Checking Eigen and Armadillo will make the CI system really unhappy.) * 4) Does it work with std::forward_list if a forward iterator is all that is required? * 5) Does it work with complex data if complex data is sensible? + * 6) Does it work with integer data if sensible? */ template void test_absolute_median() { - std::mt19937 g(12); std::vector v{-1, 2, -3, 4, -5, 6, -7}; Real m = boost::math::tools::absolute_median(v.begin(), v.end()); BOOST_TEST_EQ(m, 4); + std::mt19937 g(12); std::shuffle(v.begin(), v.end(), g); m = boost::math::tools::absolute_median(v); BOOST_TEST_EQ(m, 4); @@ -77,6 +78,17 @@ void test_absolute_median() std::array w{1, 2, -3}; m = boost::math::tools::absolute_median(w); BOOST_TEST_EQ(m, 2); + + // boost.ublas vector? + boost::numeric::ublas::vector u(6); + u[0] = 1; + u[1] = 2; + u[2] = -3; + u[3] = 1; + u[4] = 2; + u[5] = -3; + m = boost::math::tools::absolute_median(u); + BOOST_TEST_EQ(m, 2); } @@ -104,6 +116,12 @@ void test_complex_absolute_median() v = {{0, -1}}; m = boost::math::tools::absolute_median(v.begin(), v.end()); BOOST_TEST_EQ(m, 1); + + boost::numeric::ublas::vector w(1); + w[0] = {0, -1}; + m = boost::math::tools::absolute_median(w); + BOOST_TEST_EQ(m, 1); + } @@ -180,18 +198,19 @@ void test_complex_hoyer_sparsity() template void test_absolute_gini_coefficient() { + using boost::math::tools::absolute_gini_coefficient; Real tol = std::numeric_limits::epsilon(); std::vector v{-1,0,0}; - Real gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); + Real gini = absolute_gini_coefficient(v.begin(), v.end()); BOOST_TEST(abs(gini - 1) < tol); - gini = boost::math::tools::absolute_gini_coefficient(v); + gini = absolute_gini_coefficient(v); BOOST_TEST(abs(gini - 1) < tol); v[0] = 1; v[1] = -1; v[2] = 1; - gini = boost::math::tools::absolute_gini_coefficient(v.begin(), v.end()); + gini = absolute_gini_coefficient(v.begin(), v.end()); BOOST_TEST(abs(gini) < tol); std::vector> w(128); @@ -200,10 +219,43 @@ void test_absolute_gini_coefficient() { w[k] = exp(i*static_cast(k)/static_cast(w.size())); } - gini = boost::math::tools::absolute_gini_coefficient(w.begin(), w.end()); + gini = absolute_gini_coefficient(w.begin(), w.end()); BOOST_TEST(abs(gini) < tol); - // The Gini index is invariant under "cloning": If w = v \oplus v, then G(w) = G(v). + // The population Gini index is invariant under "cloning": If w = v \oplus v, then G(w) = G(v). + // We use the sample Gini index, so we need to rescale + std::vector u(1000); + std::mt19937 gen(35); + std::uniform_real_distribution dis(0, 50); + for (size_t i = 0; i < u.size()/2; ++i) + { + u[i] = dis(gen); + } + for (size_t i = 0; i < u.size()/2; ++i) + { + u[i + u.size()/2] = u[i]; + } + std::cout << std::setprecision(std::numeric_limits::digits10 + 1); + Real scale1 = (u.size() - 2)/static_cast(u.size()); + Real scale2 = (u.size() - 1)/static_cast(u.size()); + Real population_gini1 = scale1*absolute_gini_coefficient(u.begin(), u.begin() + u.size()/2); + Real population_gini2 = scale2*absolute_gini_coefficient(u.begin(), u.end()); + + BOOST_TEST(abs(population_gini1 - population_gini2) < 10*tol); + + // The Gini coefficient of a uniform distribution is (b-a)/(3*(b+a)), see https://en.wikipedia.org/wiki/Gini_coefficient + Real expected = (dis.b() - dis.a() )/(3*(dis.a() + dis.b())); + + BOOST_TEST(abs(expected - population_gini1) < 0.01); + + std::exponential_distribution exp_dis(1); + for (size_t i = 0; i < u.size(); ++i) + { + u[i] = exp_dis(gen); + } + population_gini2 = scale2*absolute_gini_coefficient(u); + + BOOST_TEST(abs(population_gini2 - 0.5) < 0.01); } template @@ -233,11 +285,11 @@ void test_oracle_snr() Real tol = 100*std::numeric_limits::epsilon(); size_t length = 100; std::vector signal(length, 1); - std::vector noise(length, 0); + std::vector noisy_signal = signal; - noise[0] = 1; - Real snr = boost::math::tools::oracle_snr(signal, noise); - Real snr_db = boost::math::tools::oracle_snr_db(signal, noise); + noisy_signal[0] += 1; + Real snr = boost::math::tools::oracle_snr(signal, noisy_signal); + Real snr_db = boost::math::tools::oracle_snr_db(signal, noisy_signal); BOOST_TEST(abs(snr - length) < tol); BOOST_TEST(abs(snr_db - 10*log10(length)) < tol); } @@ -249,11 +301,11 @@ void test_integer_oracle_snr() double tol = std::numeric_limits::epsilon(); size_t length = 100; std::vector signal(length, 1); - std::vector noise(length, 0); + std::vector noisy_signal = signal; - noise[0] = 1; - double snr = boost::math::tools::oracle_snr(signal, noise); - double snr_db = boost::math::tools::oracle_snr_db(signal, noise); + noisy_signal[0] += 1; + double snr = boost::math::tools::oracle_snr(signal, noisy_signal); + double snr_db = boost::math::tools::oracle_snr_db(signal, noisy_signal); BOOST_TEST(abs(snr - length) < tol); BOOST_TEST(abs(snr_db - 10*log10(length)) < tol); } @@ -267,11 +319,11 @@ void test_complex_oracle_snr() Real tol = 100*std::numeric_limits::epsilon(); size_t length = 100; std::vector signal(length, {1,0}); - std::vector noise(length, {0,0}); + std::vector noisy_signal = signal; - noise[0] = {1,0}; - Real snr = boost::math::tools::oracle_snr(signal, noise); - Real snr_db = boost::math::tools::oracle_snr_db(signal, noise); + noisy_signal[0] += Complex(1,0); + Real snr = boost::math::tools::oracle_snr(signal, noisy_signal); + Real snr_db = boost::math::tools::oracle_snr_db(signal, noisy_signal); BOOST_TEST(abs(snr - length) < tol); BOOST_TEST(abs(snr_db - 10*log10(length)) < tol); } @@ -280,31 +332,29 @@ template void test_m2m4_snr_estimator() { std::vector signal(5000, 1); - std::vector noise(signal.size()); std::vector x(signal.size()); std::mt19937 gen(18); std::normal_distribution dis{0, 1.0}; - for (size_t i = 0; i < noise.size(); ++i) { - signal[i] = 5*sin(100*6.28*i/noise.size()); - noise[i] = dis(gen); - x[i] = signal[i] + noise[i]; + for (size_t i = 0; i < x.size(); ++i) { + signal[i] = 5*sin(100*6.28*i/x.size()); + x[i] = signal[i] + dis(gen); } + // Kurtosis of a sine wave is 1.5: auto m2m4_db = boost::math::tools::m2m4_snr_estimator_db(x, 1.5); - auto oracle_snr_db = boost::math::tools::mean_invariant_oracle_snr_db(signal, noise); + auto oracle_snr_db = boost::math::tools::mean_invariant_oracle_snr_db(signal, x); BOOST_TEST(abs(m2m4_db - oracle_snr_db) < 0.2); std::uniform_real_distribution uni_dis{-1,1}; - for (size_t i = 0; i < noise.size(); ++i) + for (size_t i = 0; i < x.size(); ++i) { - noise[i] = uni_dis(gen); - x[i] = signal[i] + noise[i]; + x[i] = signal[i] + uni_dis(gen); } // Kurtosis of continuous uniform distribution over [-1,1] is 1.8: m2m4_db = boost::math::tools::m2m4_snr_estimator_db(x, 1.5, 1.8); - oracle_snr_db = boost::math::tools::mean_invariant_oracle_snr_db(signal, noise); + oracle_snr_db = boost::math::tools::mean_invariant_oracle_snr_db(signal, x); BOOST_TEST(abs(m2m4_db - oracle_snr_db) < 0.2); }