From ca559e76fbf279ea01e08a908a04cd799bd4d6ac Mon Sep 17 00:00:00 2001
From: NAThompson <nathompson7@protonmail.com>
Date: Wed, 9 Oct 2019 10:31:52 -0400
Subject: [PATCH] Anderson-Darling: Use statistics/univariate_statistics.hpp.
 Also fix log(0) bug by returning sensible infinity in this case.

---
 .../math/statistics/anderson_darling.hpp      | 52 ++++++++++++-------
 1 file changed, 34 insertions(+), 18 deletions(-)
diff --git a/include/boost/math/statistics/anderson_darling.hpp b/include/boost/math/statistics/anderson_darling.hpp
index 94fef42a5..f892f27e0 100644
--- a/include/boost/math/statistics/anderson_darling.hpp
+++ b/include/boost/math/statistics/anderson_darling.hpp
@@ -10,7 +10,7 @@
 
 #include <cmath>
 #include <algorithm>
-#include <boost/math/tools/univariate_statistics.hpp>
+#include <boost/math/statistics/univariate_statistics.hpp>
 #include <boost/math/special_functions/erf.hpp>
 
 namespace boost { namespace math { namespace statistics {
@@ -26,45 +26,61 @@ auto anderson_darling_normality_statistic(RandomAccessContainer const & v,
     using boost::math::erfc;
 
     if (std::isnan(mu)) {
-        mu = boost::math::tools::mean(v);
+        mu = boost::math::statistics::mean(v);
     }
     if (std::isnan(sd)) {
-        sd = sqrt(boost::math::tools::sample_variance(v));
+        sd = sqrt(boost::math::statistics::sample_variance(v));
     }
 
+    typedef boost::math::policies::policy<
+          boost::math::policies::promote_float<false>,
+          boost::math::policies::promote_double<false> >
+          no_promote_policy;
+
     // This is where Knuth's literate programming could really come in handy!
     // I need some LaTeX. The idea is that before any observation, the ecdf is identically zero.
     // So we need to compute:
-    // \int_{0}^{v_0} \frac{F(x)F'(x)}{1- F(x)} \, \mathrm{d}x, where F(x) := \frac{1}{2}[1+\erf(\frac{x-\mu}{\sigma \sqrt{2}})]
+    // \int_{-\infty}^{v_0} \frac{F(x)F'(x)}{1- F(x)} \, \mathrm{d}x, where F(x) := \frac{1}{2}[1+\erf(\frac{x-\mu}{\sigma \sqrt{2}})]
     // Astonishingly, there is an analytic evaluation to this integral, as you can validate with the following Mathematica command:
     // Integrate[(1/2 (1 + Erf[(x - mu)/Sqrt[2*sigma^2]])*Exp[-(x - mu)^2/(2*sigma^2)]*1/Sqrt[2*\[Pi]*sigma^2])/(1 - 1/2 (1 + Erf[(x - mu)/Sqrt[2*sigma^2]])),
     // {x, -Infinity, x0}, Assumptions -> {x0 \[Element] Reals && mu \[Element] Reals && sigma > 0}]
+    // This gives (for s = x-mu/sqrt(2sigma^2))
+    // -1/2 + erf(s) + log(2/(1+erf(s)))
 
 
     Real inv_var_scale = 1/(sd*sqrt(Real(2)));
     Real s0 = (v[0] - mu)*inv_var_scale;
-    Real erfcs0 = erfc(s0);
-    // Here's the actual value of the tail integral:
-    //Real left_tail = -1 + erfcs0/2 + log(Real(2)) - log(erfcs0);
-    // But we're going to add erfcs0/2 at the first integral, so drop it here:
-    //Real left_tail = -1 + log(Real(2)) - log(erfcs0);
+    Real erfcs0 = erfc(s0, no_promote_policy());
+    // Note that if erfcs0 == 0, then left_tail = inf (numerically), and hence the entire integral is numerically infinite:
+    if (erfcs0 <= 0) {
+        return std::numeric_limits<Real>::infinity();
+    }
+
+    // Note that we're going to add erfcs0/2 when we compute the integral over [x_0, x_1], so drop it here:
     Real left_tail = -1 + log(Real(2));
+
+
     // For the right tail, the ecdf is identically 1.
     // Hence we need the integral:
     // \int_{v_{n-1}}^{\infty} \frac{(1-F(x))F'(x)}{F(x)} \, \mathrm{d}x
     // This also has an analytic evaluation! It can be found via the following Mathematica command:
     // Integrate[(E^(-(z^2/2)) *(1 - 1/2 (1 + Erf[z/Sqrt[2]])))/(Sqrt[2 \[Pi]] (1/2 (1 + Erf[z/Sqrt[2]]))),
     // {z, zn, \[Infinity]}, Assumptions -> {zn \[Element] Reals && mu \[Element] Reals}]
+    // This gives (for sf = xf-mu/sqrt(2sigma^2))
+    // -1/2 + erf(sf)/2 + 2log(2/(1+erf(sf)))
 
-    typedef boost::math::policies::policy<
-          boost::math::policies::promote_float<false>,
-          boost::math::policies::promote_double<false> >
-          no_promote_policy;
     Real sf = (v[v.size()-1] - mu)*inv_var_scale;
-    Real erfcsf = erfc<Real>(sf, no_promote_policy());
-    // This is the actual value of the tail integral. However, the -erfcsf/2 cancels from previous integral:
+    //Real erfcsf = erfc<Real>(sf, no_promote_policy());
+    // This is the actual value of the tail integral. However, the -erfcsf/2 cancels from the integral over [v_{n-2}, v_{n-1}]:
     //Real right_tail = -erfcsf/2 + log(Real(2)) - log(2-erfcsf);
-    Real right_tail = log(Real(2)) - log(2-erfcsf);
+
+    // Use erfc(-x) = 2 - erfc(x)
+    Real erfcmsf = erfc<Real>(-sf, no_promote_policy());
+    // Again if this is precisely zero then the integral is numerically infinite:
+    if (erfcmsf == 0) {
+        return std::numeric_limits<Real>::infinity();
+    }
+    Real right_tail = log(2/erfcmsf);
 
     // Now we need each integral:
     // \int_{v_i}^{v_{i+1}} \frac{(i+1/n - F(x))^2F'(x)}{F(x)(1-F(x))}  \, \mathrm{d}x
@@ -78,12 +94,12 @@ auto anderson_darling_normality_statistic(RandomAccessContainer const & v,
         if (v[i] > v[i+1]) {
             throw std::domain_error("Input data must be sorted in increasing order v[0] <= v[1] <= . . .  <= v[n-1]");
         }
+
         Real k = (i+1)/Real(N);
         Real s1 = (v[i+1]-mu)*inv_var_scale;
         Real erfcs1 = erfc<Real>(s1, no_promote_policy());
-
-        // Is there more optimization potential in this expression? I don't know!
         Real term = k*(k*log(erfcs0*(-2 + erfcs1)/(erfcs1*(-2 + erfcs0))) + 2*log(erfcs1/erfcs0));
+
         integrals += term;
         s0 = s1;
         erfcs0 = erfcs1;