Fix variance calculation in accumulators::mean::operator+= (#308)

This commit is contained in:
Hans Dembinski
2021-03-16 13:17:40 +01:00
committed by GitHub
parent 7c24d2683e
commit de33fc74a4
3 changed files with 120 additions and 49 deletions

View File

@@ -31,16 +31,16 @@ public:
mean() = default;
/// Allow implicit conversion from mean<T>
/// Allow implicit conversion from mean<T>.
template <class T>
mean(const mean<T>& o) noexcept
: sum_{o.sum_}, mean_{o.mean_}, sum_of_deltas_squared_{o.sum_of_deltas_squared_} {}
/// Initialize to external count, mean, and variance
/// Initialize to external count, mean, and variance.
mean(const_reference n, const_reference mean, const_reference variance) noexcept
: sum_(n), mean_(mean), sum_of_deltas_squared_(variance * (n - 1)) {}
/// Insert sample x
/// Insert sample x.
void operator()(const_reference x) noexcept {
sum_ += static_cast<value_type>(1);
const auto delta = x - mean_;
@@ -48,7 +48,7 @@ public:
sum_of_deltas_squared_ += delta * (x - mean_);
}
/// Insert sample x with weight w
/// Insert sample x with weight w.
void operator()(const weight_type<value_type>& w, const_reference x) noexcept {
sum_ += w.value;
const auto delta = x - mean_;
@@ -56,18 +56,43 @@ public:
sum_of_deltas_squared_ += w.value * delta * (x - mean_);
}
/// Add another mean accumulator
/// Add another mean accumulator.
mean& operator+=(const mean& rhs) noexcept {
if (sum_ != 0 || rhs.sum_ != 0) {
const auto tmp = mean_ * sum_ + rhs.mean_ * rhs.sum_;
sum_ += rhs.sum_;
mean_ = tmp / sum_;
}
if (rhs.sum_ == 0) return *this;
/*
sum_of_deltas_squared
= sum_i (x_i - mu)^2
= sum_i (x_i - mu)^2 + sum_k (x_k - mu)^2
= sum_i (x_i - mu1 + (mu1 - mu))^2 + sum_k (x_k - mu2 + (mu2 - mu))^2
first part:
sum_i (x_i - mu1 + (mu1 - mu))^2
= sum_i (x_i - mu1)^2 + n1 (mu1 - mu))^2 + 2 (mu1 - mu) sum_i (x_i - mu1)
= sum_i (x_i - mu1)^2 + n1 (mu1 - mu))^2
since sum_i (x_i - mu1) = n1 mu1 - n1 mu1 = 0
Putting it together:
sum_of_deltas_squared
= sum_of_deltas_squared_1 + n1 (mu1 - mu))^2
+ sum_of_deltas_squared_2 + n2 (mu2 - mu))^2
*/
const auto mu1 = mean_;
const auto mu2 = rhs.mean_;
const auto n1 = sum_;
const auto n2 = rhs.sum_;
sum_ += rhs.sum_;
mean_ = (n1 * mu1 + n2 * mu2) / sum_;
sum_of_deltas_squared_ += rhs.sum_of_deltas_squared_;
sum_of_deltas_squared_ +=
n1 * (mean_ - mu1) * (mean_ - mu1) + n2 * (mean_ - mu2) * (mean_ - mu2);
return *this;
}
/** Scale by value
/** Scale by value.
This acts as if all samples were scaled by the value.
*/
@@ -84,7 +109,7 @@ public:
bool operator!=(const mean& rhs) const noexcept { return !operator==(rhs); }
/// Return how many samples were accumulated
/// Return how many samples were accumulated.
const_reference count() const noexcept { return sum_; }
/** Return mean value of accumulated samples.
@@ -93,7 +118,7 @@ public:
*/
const_reference value() const noexcept { return mean_; }
/** Return variance of accumulated samples
/** Return variance of accumulated samples.
The result is undefined, if `count() < 2`.
*/

View File

@@ -49,6 +49,7 @@ namespace algorithm {
template <class A, class S>
auto sum(const histogram<A, S>& hist, const coverage cov = coverage::all) {
using T = typename histogram<A, S>::value_type;
// T is arithmetic, compute sum accurately with high dynamic range
using sum_type = mp11::mp_if<std::is_arithmetic<T>, accumulators::sum<double>, T>;
sum_type sum;
if (cov == coverage::all)

View File

@@ -18,52 +18,97 @@ using namespace std::literals;
int main() {
using m_t = accumulators::mean<double>;
m_t a;
BOOST_TEST_EQ(a.count(), 0);
BOOST_TEST_EQ(a, m_t{});
a(4);
a(7);
a(13);
a(16);
// basic interface, string conversion
{
m_t a;
BOOST_TEST_EQ(a.count(), 0);
BOOST_TEST_EQ(a, m_t{});
BOOST_TEST_EQ(a.count(), 4);
BOOST_TEST_EQ(a.value(), 10);
BOOST_TEST_EQ(a.variance(), 30);
a(4);
a(7);
a(13);
a(16);
BOOST_TEST_EQ(str(a), "mean(4, 10, 30)"s);
BOOST_TEST_EQ(str(a, 20, false), " mean(4, 10, 30)"s);
BOOST_TEST_EQ(str(a, 20, true), "mean(4, 10, 30) "s);
BOOST_TEST_EQ(a.count(), 4);
BOOST_TEST_EQ(a.value(), 10);
BOOST_TEST_EQ(a.variance(), 30);
m_t b;
b(1e8 + 4);
b(1e8 + 7);
b(1e8 + 13);
b(1e8 + 16);
BOOST_TEST_EQ(str(a), "mean(4, 10, 30)"s);
BOOST_TEST_EQ(str(a, 20, false), " mean(4, 10, 30)"s);
BOOST_TEST_EQ(str(a, 20, true), "mean(4, 10, 30) "s);
}
BOOST_TEST_EQ(b.count(), 4);
BOOST_TEST_EQ(b.value(), 1e8 + 10);
BOOST_TEST_EQ(b.variance(), 30);
// small variation on large number
{
m_t a;
a(1e8 + 4);
a(1e8 + 7);
a(1e8 + 13);
a(1e8 + 16);
auto c = a;
c += a; // same as feeding all samples twice
BOOST_TEST_EQ(a.count(), 4);
BOOST_TEST_EQ(a.value(), 1e8 + 10);
BOOST_TEST_EQ(a.variance(), 30);
}
BOOST_TEST_EQ(c.count(), 8);
BOOST_TEST_EQ(c.value(), 10);
BOOST_TEST_IS_CLOSE(c.variance(), 25.714, 1e-3);
// addition of zero element
{
BOOST_TEST_EQ(m_t() += m_t(), m_t());
BOOST_TEST_EQ(m_t(1, 2, 3) += m_t(), m_t(1, 2, 3));
BOOST_TEST_EQ(m_t() += m_t(1, 2, 3), m_t(1, 2, 3));
}
// also same as feeding all samples twice
m_t d;
d(weight(2), 4);
d(weight(2), 7);
d(weight(2), 13);
d(weight(2), 16);
// addition
{
m_t a, b, c;
BOOST_TEST_EQ(d, c);
a(1);
a(2);
a(3);
BOOST_TEST_EQ(a.count(), 3);
BOOST_TEST_EQ(a.value(), 2);
BOOST_TEST_EQ(a.variance(), 1);
BOOST_TEST_EQ(m_t() += m_t(), m_t());
BOOST_TEST_EQ(m_t(1, 2, 3) += m_t(), m_t(1, 2, 3));
BOOST_TEST_EQ(m_t() += m_t(1, 2, 3), m_t(1, 2, 3));
b(4);
b(6);
BOOST_TEST_EQ(b.count(), 2);
BOOST_TEST_EQ(b.value(), 5);
BOOST_TEST_EQ(b.variance(), 2);
c(1);
c(2);
c(3);
c(4);
c(6);
auto d = a;
d += b;
BOOST_TEST_EQ(c.count(), d.count());
BOOST_TEST_EQ(c.value(), d.value());
BOOST_TEST_IS_CLOSE(c.variance(), d.variance(), 1e-3);
}
// using weight=2 must be same as adding all samples twice
{
m_t a, b;
for (int i = 0; i < 2; ++i) {
a(4);
a(7);
a(13);
a(16);
}
b(weight(2), 4);
b(weight(2), 7);
b(weight(2), 13);
b(weight(2), 16);
BOOST_TEST_EQ(a.count(), b.count());
BOOST_TEST_EQ(a.value(), b.value());
BOOST_TEST_IS_CLOSE(a.variance(), b.variance(), 1e-3);
}
return boost::report_errors();
}