From 18ed6163760892672813024c5b4697bee3d33377 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Fri, 4 Sep 2020 08:48:51 -0400 Subject: [PATCH] Kolmogorov-Smirnov distribution (#422) * Kolmogorov-Smirnov distribution #421 Add a new distribution, kolmogorov_smirnov_distribution, which takes a parameter that represents the number of observations used in a Kolmogorov-Smirnov test. (The K-S test is a popular test for comparing two CDFs, but the test statistic is not implemented here.) This implementation includes Kolmogorov's original 1st order Taylor expansion. There is a literature on the distribution's other mathematical properties (higher order terms and exact version); this literature is summarized in the main header file for anyone who may want to expand the implementation later. The CDF is implemented using a Jacobi theta function, and the PDF is a hand-rolled derivative of that function. Quantiles plug the CDF and PDF into a Newton-Raphson iteration. The mean and variance have nice closed-form expressions, and the mode uses a dumb run-time maximizer. This commit includes graphs, a ULP plotter for the PDF, and the usual compilation and numerical tests. The test file is on the small side, but it integrates the distribution from zero to infinity, and covers the quantiles pretty well. As of now the numerical tests only verify self-consistency (e.g. distribution moments and CDF-quantile relations), so there's room to add some external checks. * Implement skewness for K-S distribution [CI SKIP] The third moment integrates nicely with the help of Apery's constant (zeta_three). Verify the result via quadrature. * Implement kurtosis for the K-S distribution Verify the result via quadrature. --- doc/distributions/dist_reference.qbk | 1 + doc/distributions/kolmogorov_smirnov.qbk | 122 + .../kolmogorov_smirnov_distribution.svg | 1 + .../kolmogorov_smirnov_test_statistic.svg | 1 + doc/graphs/dist_graphs.cpp | 25 +- doc/graphs/kolmogorov_smirnov_pdf.svg | 70 + doc/graphs/kolmogorov_smirnov_pdf_ulp.svg | 2551 +++++++++++++++++ doc/html/dist.html | 2 + .../dists/kolmogorov_smirnov_dist.html | 362 +++ doc/html/standalone_HTML.manifest | 1 + doc/math.qbk | 1 + include/boost/math/distributions.hpp | 1 + include/boost/math/distributions/fwd.hpp | 4 + .../math/distributions/kolmogorov_smirnov.hpp | 494 ++++ .../accuracy/plot_kolmogorov_smirnov_pdf.cpp | 38 + reporting/performance/test_distributions.cpp | 10 + test/Jamfile.v2 | 1 + .../dist_kolmogorov_smirnov_incl_test.cpp | 25 + .../distribution_concept_check.cpp | 1 + test/compile_test/instantiate.hpp | 2 + test/test_kolmogorov_smirnov.cpp | 102 + 21 files changed, 3811 insertions(+), 4 deletions(-) create mode 100644 doc/distributions/kolmogorov_smirnov.qbk create mode 100644 doc/equations/kolmogorov_smirnov_distribution.svg create mode 100644 doc/equations/kolmogorov_smirnov_test_statistic.svg create mode 100644 doc/graphs/kolmogorov_smirnov_pdf.svg create mode 100644 doc/graphs/kolmogorov_smirnov_pdf_ulp.svg create mode 100644 doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html create mode 100644 include/boost/math/distributions/kolmogorov_smirnov.hpp create mode 100644 reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp create mode 100644 test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp create mode 100644 test/test_kolmogorov_smirnov.cpp diff --git a/doc/distributions/dist_reference.qbk b/doc/distributions/dist_reference.qbk index 63fe12bf3..c225d1953 100644 --- a/doc/distributions/dist_reference.qbk +++ b/doc/distributions/dist_reference.qbk @@ -21,6 +21,7 @@ [include inverse_chi_squared.qbk] [include inverse_gamma.qbk] [include inverse_gaussian.qbk] +[include kolmogorov_smirnov.qbk] [include laplace.qbk] [include logistic.qbk] [include lognormal.qbk] diff --git a/doc/distributions/kolmogorov_smirnov.qbk b/doc/distributions/kolmogorov_smirnov.qbk new file mode 100644 index 000000000..c557f6fc4 --- /dev/null +++ b/doc/distributions/kolmogorov_smirnov.qbk @@ -0,0 +1,122 @@ +[section:kolmogorov_smirnov_dist Kolmogorov-Smirnov Distribution] + +``#include `` + + namespace boost{ namespace math{ + + template + class kolmogorov_smirnov_distribution; + + typedef kolmogorov_smirnov_distribution<> kolmogorov_smirnov; + + template + class kolmogorov_smirnov_distribution + { + public: + typedef RealType value_type; + typedef Policy policy_type; + + // Constructor: + kolmogorov_smirnov_distribution(RealType n); + + // Accessor to parameter: + RealType number_of_observations()const; + + }} // namespaces + +The Kolmogorov-Smirnov test in statistics compares two empirical distributions, +or an empirical distribution against any theoretical distribution.[footnote +[@https://en.wikipedia.org/wiki/Kolmogorov–Smirnov_test Wikipedia: +Kolmogorov-Smirnov test]] It makes use of a specific distribution which is +informally known in the literature as the Kolmogorv-Smirnov distribution, +implemented here. + +Formally, if /n/ observations are taken from a theoretical distribution /G(x)/, +and if /G/[sub /n/](/x/) represents the empirical CDF of those /n/ observations, +then the test statistic + +[equation kolmogorov_smirnov_test_statistic] [/ D_n = \sup_x|G(x)-\hat{G}(x)| ] + +will be distributed according to a Kolmogorov-Smirnov distribution parameterized by /n/. + +The exact form of a Kolmogorov-Smirnov distribution is the subject of a large, +decades-old literature.[footnote +[@https://www.jstatsoft.org/article/view/v039i11 Simard, R. and L'Ecuyer, P. +(2011) "Computing the Two-Sided Kolmogorov-Smirnov Distribution". Journal of +Statistical Software, vol. 39, no. 11.]] In the interest of simplicity, Boost +implements the first-order, limiting form of this distribution (the same form +originally identified by Kolmogorov[footnote Kolmogorov A (1933). "Sulla +determinazione empirica di una legge di distribuzione". G. Ist. Ital. Attuari. +4: 83–91.]), namely + +[equation kolmogorov_smirnov_distribution] +[/ \lim_{n \to \infty} F_n(x/\sqrt{n})=1+2\sum_{k=1}^\infty (-1)^k e^{-2k^2x^2} ] + +Note that while the exact distribution only has support over \[0, 1\], this +limiting form has positive mass above unity, particularly for small /n/. The +following graph illustrations how the distribution changes for different values +of /n/: + +[graph kolmogorov_smirnov_pdf] + +[h4 Member Functions] + + kolmogorov_smirnov_distribution(RealType n); + +Constructs a Kolmogorov-Smirnov distribution with /n/ observations. + +Requires n > 0, otherwise calls __domain_error. + + RealType number_of_observations()const; + +Returns the parameter /n/ from which this object was constructed. + +[h4 Non-member Accessors] + +All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] +that are generic to all distributions are supported: __usual_accessors. + +The domain of the random variable is \[0, +[infin]\]. + +[h4 Accuracy] + +The CDF of the Kolmogorov-Smirnov distribution is implemented in terms of the +fourth [link math_toolkit.jacobi_theta.jacobi_theta_overview Jacobi Theta +function]; please refer to the accuracy ULP plots for that function. + +The PDF is implemented separately, and the following ULP plot illustrates its +accuracy: + +[graph kolmogorov_smirnov_pdf_ulp] + +Because PDF values are simply scaled out and up by the square root of /n/, the +above plot is representative for all values of /n/. Note that for present +purposes, "accuracy" refers to deviations from the limiting approximation, +rather than deviations from the exact distribution. + +[h4 Implementation] + +In the following table, /n/ is the number of observations, /x/ is the random variable, +[pi] is Archimedes' constant, and [zeta](3) is Apéry's constant. + +[table +[[Function][Implementation Notes]] +[[cdf][Using the relation: cdf = __jacobi_theta4tau(0, 2*x*x/[pi])]] +[[pdf][Using a manual derivative of the CDF]] +[[cdf complement][ +When x*x*n == 0: 1 + +When 2*x*x*n <= [pi]: 1 - __jacobi_theta4tau(0, 2*x*x*n/[pi]) + +When 2*x*x*n > [pi]: -__jacobi_theta4m1tau(0, 2*x*x*n/[pi])]] +[[quantile][Using a Newton-Raphson iteration]] +[[quantile from the complement][Using a Newton-Raphson iteration]] +[[mode][Using a run-time PDF maximizer]] +[[mean][sqrt([pi]/2) * ln(2) / sqrt(n)]] +[[variance][([pi][super 2]/12 - [pi]/2*ln[super 2](2))/n]] +[[skewness][(9/16*sqrt([pi]/2)*[zeta](3)/n[super 3/2] - 3 * mean * variance - mean[super 2] * variance) / (variance[super 3/2])]] +[[kurtosis][(7/720*[pi][super 4]/n[super 2] - 4 * mean * skewness * variance[super 3/2] - 6 * mean[super 2] * variance - mean[super 4]) / (variance[super 2])]] +] + +[endsect] [/section:kolmogorov_smirnov_dist Kolmogorov-Smirnov] diff --git a/doc/equations/kolmogorov_smirnov_distribution.svg b/doc/equations/kolmogorov_smirnov_distribution.svg new file mode 100644 index 000000000..4233fe70d --- /dev/null +++ b/doc/equations/kolmogorov_smirnov_distribution.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/equations/kolmogorov_smirnov_test_statistic.svg b/doc/equations/kolmogorov_smirnov_test_statistic.svg new file mode 100644 index 000000000..8c72cefa5 --- /dev/null +++ b/doc/equations/kolmogorov_smirnov_test_statistic.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/graphs/dist_graphs.cpp b/doc/graphs/dist_graphs.cpp index bcf0bb23a..611c357ca 100644 --- a/doc/graphs/dist_graphs.cpp +++ b/doc/graphs/dist_graphs.cpp @@ -84,8 +84,9 @@ public: m_distributions.push_back(std::make_pair(name, d)); // // Get the extent of the distribution from the support: - double a, b; - std::tr1::tie(a, b) = support(d); + std::pair r = support(d); + double a = r.first; + double b = r.second; // // PDF maximum is at the mode (probably): double mod; @@ -253,7 +254,7 @@ public: if(!is_discrete_distribution::value) { // Continuous distribution: - for(std::list >::const_iterator i = m_distributions.begin(); + for(typename std::list >::const_iterator i = m_distributions.begin(); i != m_distributions.end(); ++i) { double x = m_min_x; @@ -280,7 +281,7 @@ public: // Discrete distribution: double x_width = 0.75 / m_distributions.size(); double x_off = -0.5 * 0.75; - for(std::list >::const_iterator i = m_distributions.begin(); + for(typename std::list >::const_iterator i = m_distributions.begin(); i != m_distributions.end(); ++i) { double x = ceil(m_min_x); @@ -469,6 +470,22 @@ int main() fisher_f_plotter.add(boost::math::fisher_f_distribution<>(4, 10), "n=4, m=10"); fisher_f_plotter.plot("F Distribution PDF", "fisher_f_pdf.svg"); + distribution_plotter > + kolmogorov_smirnov_cdf_plotter(false); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); + kolmogorov_smirnov_cdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); + kolmogorov_smirnov_cdf_plotter.plot("Kolmogorov-Smirnov Distribution CDF", "kolmogorov_smirnov_cdf.svg"); + + distribution_plotter > + kolmogorov_smirnov_pdf_plotter; + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(1), "n=1"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(2), "n=2"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(5), "n=5"); + kolmogorov_smirnov_pdf_plotter.add(boost::math::kolmogorov_smirnov_distribution<>(10), "n=10"); + kolmogorov_smirnov_pdf_plotter.plot("Kolmogorov-Smirnov Distribution PDF", "kolmogorov_smirnov_pdf.svg"); + distribution_plotter > lognormal_plotter; lognormal_plotter.add(boost::math::lognormal_distribution<>(-1), "location=-1"); diff --git a/doc/graphs/kolmogorov_smirnov_pdf.svg b/doc/graphs/kolmogorov_smirnov_pdf.svg new file mode 100644 index 000000000..46dcb98aa --- /dev/null +++ b/doc/graphs/kolmogorov_smirnov_pdf.svg @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +0 +0.5 +1 +1.5 +0 + +0 +1 +2 +3 +4 +5 +0 + +Probability + +Random Variable + + + + + + + + + +n=1 +n=2 +n=5 +n=10 + +Kolmogorov-Smirnov Distribution PDF + + + diff --git a/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg b/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg new file mode 100644 index 000000000..f1adef5dc --- /dev/null +++ b/doc/graphs/kolmogorov_smirnov_pdf_ulp.svg @@ -0,0 +1,2551 @@ + + + +Kolmogorov-Smirnov PDF (N=10) ULP plot at float precision + + + + +-75 + +-50 + +-25 + +0 + +25 + +50 + +75 + +100 + +0.1 + +0.2 + +0.3 + +0.4 + +0.5 + +0.6 + +0.7 + +0.8 + +0.9 + +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/html/dist.html b/doc/html/dist.html index 6dfd1bd00..4d84eb986 100644 --- a/doc/html/dist.html +++ b/doc/html/dist.html @@ -169,6 +169,8 @@ Gamma Distribution
Inverse Gaussian (or Inverse Normal) Distribution
+
Kolmogorov-Smirnov + Distribution
Laplace Distribution
Logistic Distribution
diff --git a/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html b/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html new file mode 100644 index 000000000..ff44b4c37 --- /dev/null +++ b/doc/html/math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html @@ -0,0 +1,362 @@ + + + +Kolmogorov-Smirnov Distribution + + + + + + + + + + + + + + + +
Boost C++ LibrariesHomeLibrariesPeopleFAQMore
+
+
+PrevUpHomeNext +
+
+ + +
#include <boost/math/distributions/kolmogorov_smirnov.hpp>
+
namespace boost{ namespace math{
+
+template <class RealType = double,
+          class Policy   = policies::policy<> >
+class kolmogorov_smirnov_distribution;
+
+typedef kolmogorov_smirnov_distribution<> kolmogorov_smirnov;
+
+template <class RealType, class Policy>
+class kolmogorov_smirnov_distribution
+{
+public:
+   typedef RealType  value_type;
+   typedef Policy    policy_type;
+
+   // Constructor:
+   kolmogorov_smirnov_distribution(RealType n);
+
+   // Accessor to parameter:
+   RealType number_of_observations()const;
+
+}} // namespaces
+
+

+ The Kolmogorov-Smirnov test in statistics compares two empirical distributions, + or an empirical distribution against any theoretical distribution.[1] It makes use of a specific distribution which is informally + known in the literature as the Kolmogorv-Smirnov distribution, implemented + here. +

+

+ Formally, if n observations are taken from a theoretical + distribution G(x), and if Gn(x) + represents the empirical CDF of those n observations, + then the test statistic +

+
+

+ + +

+
+

+ will be distributed according to a Kolmogorov-Smirnov distribution parameterized + by n. +

+

+ The exact form of a Kolmogorov-Smirnov distribution is the subject of a + large, decades-old literature.[2] In the interest of simplicity, Boost implements the first-order, + limiting form of this distribution (the same form originally identified + by Kolmogorov[3]), namely +

+
+

+ + +

+
+

+ Note that while the exact distribution only has support over [0, 1], this + limiting form has positive mass above unity, particlularly for small n. + The following graph illustrations how the distribution changes for different + values of n: +

+
+

+ + +

+
+
+ + Member + Functions +
+
kolmogorov_smirnov_distribution(RealType n);
+
+

+ Constructs a Kolmogorov-Smirnov distribution with n + observations. +

+

+ Requires n > 0, otherwise calls domain_error. +

+
RealType number_of_observations()const;
+
+

+ Returns the parameter n from which this object was + constructed. +

+
+ + Non-member + Accessors +
+

+ All the usual non-member accessor + functions that are generic to all distributions are supported: + Cumulative Distribution Function, + Probability Density Function, + Quantile, Hazard Function, Cumulative Hazard Function, + mean, median, + mode, variance, + standard deviation, + skewness, kurtosis, kurtosis_excess, + range and support. +

+

+ The domain of the random variable is [0, +∞]. +

+
+ + Accuracy +
+

+ The CDF of the Kolmogorov-Smirnov distribution is implemented in terms + of the fourth Jacobi + Theta function; please refer to the accuracy ULP plots for that + function. +

+

+ The PDF is implemented separately, and the following ULP plot illustrates + its accuracy: +

+
+

+ + +

+
+

+ Because PDF values are simply scaled out and up by the square root of + n, the above plot is representative for all values + of n. Note that for present purposes, "accuracy" + refers to deviations from the limiting approximation, rather than deviations + from the exact distribution. +

+
+ + Implementation +
+

+ In the following table, n is the number of observations, + x is the random variable, π is Archimedes' constant, + and ζ(3) is Apéry's constant. +

+
+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ Function +

+
+

+ Implementation Notes +

+
+

+ cdf +

+
+

+ Using the relation: cdf = jacobi_theta4tau(0, + 2*x*x/π) +

+
+

+ pdf +

+
+

+ Using a manual derivative of the CDF +

+
+

+ cdf complement +

+
+

+ When x*x*n == 0: 1 +

+

+ When 2*x*x*n <= π: 1 - jacobi_theta4tau(0, + 2*x*x*n/π) +

+

+ When 2*x*x*n > π: -jacobi_theta4m1tau(0, + 2*x*x*n/π) +

+
+

+ quantile +

+
+

+ Using a Newton-Raphson iteration +

+
+

+ quantile from the complement +

+
+

+ Using a Newton-Raphson iteration +

+
+

+ mode +

+
+

+ Using a run-time PDF maximizer +

+
+

+ mean +

+
+

+ sqrt(π/2) * ln(2) / sqrt(n) +

+
+

+ variance +

+
+

+ (π2/12 - π/2*ln2(2))/n +

+
+

+ skewness +

+
+

+ (9/16*sqrt(π/2)*ζ(3)/n3/2 - 3 * mean * variance - mean2 * variance) + / (variance3/2) +

+
+

+ kurtosis +

+
+

+ (7/720*π4/n2 - 4 * mean * skewness * variance3/2 - 6 * mean2 * variance + - mean4) / (variance2) +

+
+
+
+

+ + +
+

[3] + Kolmogorov A (1933). "Sulla determinazione empirica di una legge + di distribuzione". G. Ist. Ital. Attuari. 4: 83–91. +

+
+
+
+ + + +
+
+
+PrevUpHomeNext +
+ + diff --git a/doc/html/standalone_HTML.manifest b/doc/html/standalone_HTML.manifest index 003b88ea9..614cce497 100644 --- a/doc/html/standalone_HTML.manifest +++ b/doc/html/standalone_HTML.manifest @@ -130,6 +130,7 @@ math_toolkit/dist_ref/dists/hypergeometric_dist.html math_toolkit/dist_ref/dists/inverse_chi_squared_dist.html math_toolkit/dist_ref/dists/inverse_gamma_dist.html math_toolkit/dist_ref/dists/inverse_gaussian_dist.html +math_toolkit/dist_ref/dists/kolmogorov_smirnov_dist.html math_toolkit/dist_ref/dists/laplace_dist.html math_toolkit/dist_ref/dists/logistic_dist.html math_toolkit/dist_ref/dists/lognormal_dist.html diff --git a/doc/math.qbk b/doc/math.qbk index 984140d84..847b33e6e 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -387,6 +387,7 @@ and use the function's name as the link text.] [def __inverse_gamma_distrib [link math_toolkit.dist_ref.dists.inverse_gamma_dist Inverse Gamma Distribution]] [def __inverse_gaussian_distrib [link math_toolkit.dist_ref.dists.inverse_gaussian_dist Inverse Gaussian Distribution]] [def __inverse_chi_squared_distrib [link math_toolkit.dist_ref.dists.inverse_chi_squared_dist Inverse chi squared Distribution]] +[def __kolmogorov_smirnov_distrib [link math_toolkit.dist_ref.dists.kolmogorov_smirnov Kolmogorov-Smirnov Distribution]] [def __laplace_distrib [link math_toolkit.dist_ref.dists.laplace_dist Laplace Distribution]] [def __logistic_distrib [link math_toolkit.dist_ref.dists.logistic_dist Logistic Distribution]] [def __lognormal_distrib [link math_toolkit.dist_ref.dists.lognormal_dist Log-normal Distribution]] diff --git a/include/boost/math/distributions.hpp b/include/boost/math/distributions.hpp index 300f2312c..64da99415 100644 --- a/include/boost/math/distributions.hpp +++ b/include/boost/math/distributions.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/include/boost/math/distributions/fwd.hpp b/include/boost/math/distributions/fwd.hpp index 5b212c8ec..a3c1a41df 100644 --- a/include/boost/math/distributions/fwd.hpp +++ b/include/boost/math/distributions/fwd.hpp @@ -63,6 +63,9 @@ class inverse_gamma_distribution; template class inverse_gaussian_distribution; +template +class kolmogorov_smirnov_distribution; + template class laplace_distribution; @@ -129,6 +132,7 @@ class weibull_distribution; typedef boost::math::gamma_distribution gamma;\ typedef boost::math::geometric_distribution geometric;\ typedef boost::math::hypergeometric_distribution hypergeometric;\ + typedef boost::math::kolmogorov_smirnov_distribution kolmogorov_smirnov;\ typedef boost::math::inverse_chi_squared_distribution inverse_chi_squared;\ typedef boost::math::inverse_gaussian_distribution inverse_gaussian;\ typedef boost::math::inverse_gamma_distribution inverse_gamma;\ diff --git a/include/boost/math/distributions/kolmogorov_smirnov.hpp b/include/boost/math/distributions/kolmogorov_smirnov.hpp new file mode 100644 index 000000000..1dc0a92cc --- /dev/null +++ b/include/boost/math/distributions/kolmogorov_smirnov.hpp @@ -0,0 +1,494 @@ +// Kolmogorov-Smirnov 1st order asymptotic distribution +// Copyright Evan Miller 2020 +// +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// The Kolmogorov-Smirnov test in statistics compares two empirical distributions, +// or an empirical distribution against any theoretical distribution. It makes +// use of a specific distribution which doesn't have a formal name, but which +// is often called the Kolmogorv-Smirnov distribution for lack of anything +// better. This file implements the limiting form of this distribution, first +// identified by Andrey Kolmogorov in +// +// Kolmogorov, A. (1933) “Sulla Determinazione Empirica di una Legge di +// Distribuzione.” Giornale dell’ Istituto Italiano degli Attuari +// +// This limiting form of the CDF is a first-order Taylor expansion that is +// easily implemented by the fourth Jacobi Theta function (setting z=0). The +// PDF is then implemented here as a derivative of the Theta function. Note +// that this derivative is with respect to x, which enters into \tau, and not +// with respect to the z argument, which is always zero, and so the derivative +// identities in DLMF 20.4 do not apply here. +// +// A higher order order expansion is possible, and was first outlined by +// +// Pelz W, Good IJ (1976). “Approximating the Lower Tail-Areas of the +// Kolmogorov-Smirnov One-sample Statistic.” Journal of the Royal Statistical +// Society B. +// +// The terms in this expansion get fairly complicated, and as far as I know the +// Pelz-Good expansion is not used in any statistics software. Someone could +// consider updating this implementation to use the Pelz-Good expansion in the +// future, but the math gets considerably hairier with each additional term. +// +// A formula for an exact version of the Kolmogorov-Smirnov test is laid out in +// Equation 2.4.4 of +// +// Durbin J (1973). “Distribution Theory for Tests Based on the Sample +// Distribution Func- tion.” In SIAM CBMS-NSF Regional Conference Series in +// Applied Mathematics. SIAM, Philadelphia, PA. +// +// which is available in book form from Amazon and others. This exact version +// involves taking powers of large matrices. To do that right you need to +// compute eigenvalues and eigenvectors, which are beyond the scope of Boost. +// (Some recent work indicates the exact form can also be computed via FFT, see +// https://cran.r-project.org/web/packages/KSgeneral/KSgeneral.pdf). +// +// Even if the CDF of the exact distribution could be computed using Boost +// libraries (which would be cumbersome), the PDF would present another +// difficulty. Therefore I am limiting this implementation to the asymptotic +// form, even though the exact form has trivial values for certain specific +// values of x and n. For more on trivial values see +// +// Ruben H, Gambino J (1982). “The Exact Distribution of Kolmogorov’s Statistic +// Dn for n ≤ 10.” Annals of the Institute of Statistical Mathematics. +// +// For a good bibliography and overview of the various algorithms, including +// both exact and asymptotic forms, see +// https://www.jstatsoft.org/article/view/v039i11 +// +// As for this implementation: the distribution is parameterized by n (number +// of observations) in the spirit of chi-squared's degrees of freedom. It then +// takes a single argument x. In terms of the Kolmogorov-Smirnov statistical +// test, x represents the distribution of D_n, where D_n is the maximum +// difference between the CDFs being compared, that is, +// +// D_n = sup|F_n(x) - G(x)| +// +// In the exact distribution, x is confined to the support [0, 1], but in this +// limiting approximation, we allow x to exceed unity (similar to how a normal +// approximation always spills over any boundaries). +// +// As mentioned previously, the CDF is implemented using the \tau +// parameterization of the fourth Jacobi Theta function as +// +// CDF=θ₄(0|2*x*x*n/pi) +// +// The PDF is a hand-coded derivative of that function. Actually, there are two +// (independent) derivatives, as separate code paths are used for "small x" +// (2*x*x*n < pi) and "large x", mirroring the separate code paths in the +// Jacobi Theta implementation to achieve fast convergence. Quantiles are +// computed using a Newton-Raphson iteration from an initial guess that I +// arrived at by trial and error. +// +// The mean and variance are implemented using simple closed-form expressions. +// Skewness and kurtosis use slightly more complicated closed-form expressions +// that involve the zeta function. The mode is calculated at run-time by +// maximizing the PDF. If you have an analytical solution for the mode, feel +// free to plop it in. +// +// The CDF and PDF could almost certainly be re-implemented and sped up using a +// polynomial or rational approximation, since the only meaningful argument is +// x * sqrt(n). But that is left as an exercise for the next maintainer. +// +// In the future, the Pelz-Good approximation could be added. I suggest adding +// a second parameter representing the order, e.g. +// +// kolmogorov_smirnov_dist<>(100) // N=100, order=1 +// kolmogorov_smirnov_dist<>(100, 1) // N=100, order=1, i.e. Kolmogorov's formula +// kolmogorov_smirnov_dist<>(100, 4) // N=100, order=4, i.e. Pelz-Good formula +// +// The exact distribution could be added to the API with a special order +// parameter (e.g. 0 or infinity), or a separate distribution type altogether +// (e.g. kolmogorov_smirnov_exact_distribution). +// +#ifndef BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP +#define BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP + +#include +#include +#include +#include +#include +#include // Newton-Raphson +#include // For the mode + +namespace boost { namespace math { + +namespace detail { +template +inline RealType kolmogorov_smirnov_quantile_guess(RealType p) { + // Choose a starting point for the Newton-Raphson iteration + if (p > 0.9) + return RealType(1.8) - 5 * (1 - p); + if (p < 0.3) + return p + RealType(0.45); + return p + RealType(0.3); +} + +// d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI)) +template +RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) { + BOOST_MATH_STD_USING + RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0); + RealType eps = policies::get_epsilon(); + int i = 0; + RealType pi2 = constants::pi_sqr(); + RealType x2n = x*x*n; + if (x2n*x2n == 0.0) { + return static_cast(0); + } + while (1) { + delta = exp(-RealType(i+0.5)*RealType(i+0.5)*pi2/(2*x2n)) * (RealType(i+0.5)*RealType(i+0.5)*pi2 - x2n); + + if (delta == 0.0) + break; + + if (last_delta != 0.0 && fabs(delta/last_delta) < eps) + break; + + value += delta + delta; + last_delta = delta; + i++; + } + + return value * sqrt(n) * constants::root_half_pi() / (x2n*x2n); +} + +// d/dx (theta4(0, 2*x*x*n/M_PI)) +template +inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Policy&) { + BOOST_MATH_STD_USING + RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0); + RealType eps = policies::get_epsilon(); + int i = 1; + while (1) { + delta = 8*x*i*i*exp(-2*i*i*x*x*n); + + if (delta == 0.0) + break; + + if (last_delta != 0.0 && fabs(delta / last_delta) < eps) + break; + + if (i%2 == 0) + delta = -delta; + + value += delta; + last_delta = delta; + i++; + } + + return value * n; +} + +}; // detail + +template > + class kolmogorov_smirnov_distribution +{ + public: + typedef RealType value_type; + typedef Policy policy_type; + + // Constructor + kolmogorov_smirnov_distribution( RealType n ) : n_obs_(n) + { + RealType result; + detail::check_df( + "boost::math::kolmogorov_smirnov_distribution<%1%>::kolmogorov_smirnov_distribution", n_obs_, &result, Policy()); + } + + RealType number_of_observations()const + { + return n_obs_; + } + + private: + + RealType n_obs_; // positive integer +}; + +typedef kolmogorov_smirnov_distribution kolmogorov_k; // Convenience typedef for double version. + +namespace detail { +template +struct kolmogorov_smirnov_quantile_functor +{ + kolmogorov_smirnov_quantile_functor(const boost::math::kolmogorov_smirnov_distribution dist, RealType const& p) + : distribution(dist), prob(p) + { + } + + boost::math::tuple operator()(RealType const& x) + { + RealType fx = cdf(distribution, x) - prob; // Difference cdf - value - to minimize. + RealType dx = pdf(distribution, x); // pdf is 1st derivative. + // return both function evaluation difference f(x) and 1st derivative f'(x). + return boost::math::make_tuple(fx, dx); + } +private: + const boost::math::kolmogorov_smirnov_distribution distribution; + RealType prob; +}; + +template +struct kolmogorov_smirnov_complementary_quantile_functor +{ + kolmogorov_smirnov_complementary_quantile_functor(const boost::math::kolmogorov_smirnov_distribution dist, RealType const& p) + : distribution(dist), prob(p) + { + } + + boost::math::tuple operator()(RealType const& x) + { + RealType fx = cdf(complement(distribution, x)) - prob; // Difference cdf - value - to minimize. + RealType dx = -pdf(distribution, x); // pdf is the negative of the derivative of (1-CDF) + // return both function evaluation difference f(x) and 1st derivative f'(x). + return boost::math::make_tuple(fx, dx); + } +private: + const boost::math::kolmogorov_smirnov_distribution distribution; + RealType prob; +}; + +template +struct kolmogorov_smirnov_negative_pdf_functor +{ + RealType operator()(RealType const& x) { + if (2*x*x < constants::pi()) { + return -kolmogorov_smirnov_pdf_small_x(x, static_cast(1), Policy()); + } + return -kolmogorov_smirnov_pdf_large_x(x, static_cast(1), Policy()); + } +}; +} // namespace detail + +template +inline const std::pair range(const kolmogorov_smirnov_distribution& /*dist*/) +{ // Range of permissible values for random variable x. + using boost::math::tools::max_value; + return std::pair(static_cast(0), max_value()); +} + +template +inline const std::pair support(const kolmogorov_smirnov_distribution& /*dist*/) +{ // Range of supported values for random variable x. + // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. + // In the exact distribution, the upper limit would be 1. + using boost::math::tools::max_value; + return std::pair(static_cast(0), max_value()); +} + +template +inline RealType pdf(const kolmogorov_smirnov_distribution& dist, const RealType& x) +{ + BOOST_FPU_EXCEPTION_GUARD + BOOST_MATH_STD_USING // for ADL of std functions. + + RealType n = dist.number_of_observations(); + RealType error_result; + static const char* function = "boost::math::pdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + if (x < 0 || !(boost::math::isfinite)(x)) + { + return policies::raise_domain_error( + function, "Kolmogorov-Smirnov parameter was %1%, but must be > 0 !", x, Policy()); + } + + if (2*x*x*n < constants::pi()) { + return detail::kolmogorov_smirnov_pdf_small_x(x, n, Policy()); + } + + return detail::kolmogorov_smirnov_pdf_large_x(x, n, Policy()); +} // pdf + +template +inline RealType cdf(const kolmogorov_smirnov_distribution& dist, const RealType& x) +{ + BOOST_MATH_STD_USING // for ADL of std function exp. + static const char* function = "boost::math::cdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + RealType error_result; + RealType n = dist.number_of_observations(); + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + if((x < 0) || !(boost::math::isfinite)(x)) { + return policies::raise_domain_error( + function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); + } + + if (x*x*n == 0) + return 0; + + return jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); +} // cdf + +template +inline RealType cdf(const complemented2_type, RealType>& c) { + BOOST_MATH_STD_USING // for ADL of std function exp. + RealType x = c.param; + static const char* function = "boost::math::cdf(const complemented2_type&, %1%>)"; + RealType error_result; + kolmogorov_smirnov_distribution const& dist = c.dist; + RealType n = dist.number_of_observations(); + + if(false == detail::check_x_not_NaN(function, x, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + if((x < 0) || !(boost::math::isfinite)(x)) + return policies::raise_domain_error( + function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy()); + + if (x*x*n == 0) + return 1; + + if (2*x*x*n > constants::pi()) + return -jacobi_theta4m1tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); + + return RealType(1) - jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi(), Policy()); +} // cdf (complemented) + +template +inline RealType quantile(const kolmogorov_smirnov_distribution& dist, const RealType& p) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + // Error check: + RealType error_result; + RealType n = dist.number_of_observations(); + if(false == detail::check_probability(function, p, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + RealType k = detail::kolmogorov_smirnov_quantile_guess(p) / sqrt(n); + const int get_digits = policies::digits();// get digits from policy, + boost::uintmax_t m = policies::get_max_root_iterations(); // and max iterations. + + return tools::newton_raphson_iterate(detail::kolmogorov_smirnov_quantile_functor(dist, p), + k, RealType(0), boost::math::tools::max_value(), get_digits, m); +} // quantile + +template +inline RealType quantile(const complemented2_type, RealType>& c) { + BOOST_MATH_STD_USING + static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)"; + kolmogorov_smirnov_distribution const& dist = c.dist; + RealType n = dist.number_of_observations(); + // Error check: + RealType error_result; + RealType p = c.param; + + if(false == detail::check_probability(function, p, &error_result, Policy())) + return error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + RealType k = detail::kolmogorov_smirnov_quantile_guess(RealType(1-p)) / sqrt(n); + + const int get_digits = policies::digits();// get digits from policy, + boost::uintmax_t m = policies::get_max_root_iterations(); // and max iterations. + + return tools::newton_raphson_iterate( + detail::kolmogorov_smirnov_complementary_quantile_functor(dist, p), + k, RealType(0), boost::math::tools::max_value(), get_digits, m); +} // quantile (complemented) + +template +inline RealType mode(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::mode(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + + std::pair r = boost::math::tools::brent_find_minima( + detail::kolmogorov_smirnov_negative_pdf_functor(), + static_cast(0), static_cast(1), policies::digits()); + return r.first / sqrt(n); +} + +// Mean and variance come directly from +// https://www.jstatsoft.org/article/view/v008i18 Section 3 +template +inline RealType mean(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::mean(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return constants::root_half_pi() * constants::ln_two() / sqrt(n); +} + +template +inline RealType variance(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::variance(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return (constants::pi_sqr_div_six() + - constants::pi() * constants::ln_two() * constants::ln_two()) / (2*n); +} + +// Skewness and kurtosis come from integrating the PDF +// The alternating series pops out a Dirichlet eta function which is related to the zeta function +template +inline RealType skewness(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::skewness(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + RealType ex3 = RealType(0.5625) * constants::root_half_pi() * constants::zeta_three() / n / sqrt(n); + RealType mean = boost::math::mean(dist); + RealType var = boost::math::variance(dist); + return (ex3 - 3 * mean * var - mean * mean * mean) / var / sqrt(var); +} + +template +inline RealType kurtosis(const kolmogorov_smirnov_distribution& dist) +{ + BOOST_MATH_STD_USING + static const char* function = "boost::math::kurtosis(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + RealType ex4 = 7 * constants::pi_sqr_div_six() * constants::pi_sqr_div_six() / 20 / n / n; + RealType mean = boost::math::mean(dist); + RealType var = boost::math::variance(dist); + RealType skew = boost::math::skewness(dist); + return (ex4 - 4 * mean * skew * var * sqrt(var) - 6 * mean * mean * var - mean * mean * mean * mean) / var / var; +} + +template +inline RealType kurtosis_excess(const kolmogorov_smirnov_distribution& dist) +{ + static const char* function = "boost::math::kurtosis_excess(const kolmogorov_smirnov_distribution<%1%>&)"; + RealType n = dist.number_of_observations(); + RealType error_result; + if(false == detail::check_df(function, n, &error_result, Policy())) + return error_result; + return kurtosis(dist) - 3; +} +}} +#endif diff --git a/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp new file mode 100644 index 000000000..04261a03e --- /dev/null +++ b/reporting/accuracy/plot_kolmogorov_smirnov_pdf.cpp @@ -0,0 +1,38 @@ +// Copyright Evan Miller 2020 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +using boost::math::tools::ulps_plot; + +int main() { + using PreciseReal = long double; + using CoarseReal = float; + + boost::math::kolmogorov_smirnov_distribution dist_coarse(10); + auto pdf_coarse = [&, dist_coarse](CoarseReal x) { + return boost::math::pdf(dist_coarse, x); + }; + boost::math::kolmogorov_smirnov_distribution dist_precise(10); + auto pdf_precise = [&, dist_precise](PreciseReal x) { + return boost::math::pdf(dist_precise, x); + }; + + int samples = 2500; + int width = 800; + PreciseReal clip = 100; + + std::string filename1 = "kolmogorov_smirnov_pdf_" + boost::core::demangle(typeid(CoarseReal).name()) + ".svg"; + auto plot1 = ulps_plot(pdf_precise, 0.0, 1.0, samples); + plot1.clip(clip).width(width); + std::string title1 = "Kolmogorov-Smirnov PDF (N=10) ULP plot at " + boost::core::demangle(typeid(CoarseReal).name()) + " precision"; + plot1.title(title1); + plot1.vertical_lines(10); + plot1.add_fn(pdf_coarse); + plot1.write(filename1); +} diff --git a/reporting/performance/test_distributions.cpp b/reporting/performance/test_distributions.cpp index ddd72ee96..2b54d4760 100644 --- a/reporting/performance/test_distributions.cpp +++ b/reporting/performance/test_distributions.cpp @@ -436,6 +436,16 @@ int main() test_boost_2_param(inverse_gaussian); + distribution_tester kolmogorov("KolmogorovSmirnov"); + kolmogorov.add_test_case(3, one_param_quantile >()); + kolmogorov.add_test_case(20, one_param_quantile >()); + kolmogorov.add_test_case(200, one_param_quantile >()); + kolmogorov.add_test_case(2000, one_param_quantile >()); + kolmogorov.add_test_case(20000, one_param_quantile >()); + kolmogorov.add_test_case(200000, one_param_quantile >()); + + test_boost_1_param(kolmogorov); + distribution_tester laplace("Laplace"); laplace.add_test_case(0, 1, two_param_quantile >()); laplace.add_test_case(20, 20, two_param_quantile >()); diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 9110a0130..60dc9c154 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -630,6 +630,7 @@ test-suite distribution_tests : [ run test_inverse_chi_squared_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gamma_distribution.cpp ../../test/build//boost_unit_test_framework ] [ run test_inverse_gaussian.cpp ../../test/build//boost_unit_test_framework ] + [ run test_kolmogorov_smirnov.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_auto_declarations cxx11_lambdas cxx11_unified_initialization_syntax cxx11_smart_ptr ] ] [ run test_laplace.cpp ../../test/build//boost_unit_test_framework ] [ run test_inv_hyp.cpp pch ../../test/build//boost_unit_test_framework ] [ run test_logistic_dist.cpp ../../test/build//boost_unit_test_framework ] diff --git a/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp new file mode 100644 index 000000000..9bfe384ac --- /dev/null +++ b/test/compile_test/dist_kolmogorov_smirnov_incl_test.cpp @@ -0,0 +1,25 @@ +// Copyright Evan Miller 2020 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Basic sanity check that header +// #includes all the files that it needs to. +// +#include +// +// Note this header includes no other headers, this is +// important if this test is to be meaningful: +// +#include "test_compile_result.hpp" + +void compile_and_link_test() +{ + TEST_DIST_FUNC(kolmogorov_smirnov) +} + +template class boost::math::kolmogorov_smirnov_distribution >; +template class boost::math::kolmogorov_smirnov_distribution >; +#ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS +template class boost::math::kolmogorov_smirnov_distribution >; +#endif diff --git a/test/compile_test/distribution_concept_check.cpp b/test/compile_test/distribution_concept_check.cpp index 26b236a55..fc8d2fab8 100644 --- a/test/compile_test/distribution_concept_check.cpp +++ b/test/compile_test/distribution_concept_check.cpp @@ -33,6 +33,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); diff --git a/test/compile_test/instantiate.hpp b/test/compile_test/instantiate.hpp index be9dae129..16bd7ece0 100644 --- a/test/compile_test/instantiate.hpp +++ b/test/compile_test/instantiate.hpp @@ -76,6 +76,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); @@ -111,6 +112,7 @@ void instantiate(RealType) function_requires > >(); function_requires > >(); function_requires > >(); + function_requires > >(); function_requires > >(); function_requires > >(); function_requires > >(); diff --git a/test/test_kolmogorov_smirnov.cpp b/test/test_kolmogorov_smirnov.cpp new file mode 100644 index 000000000..1d949fd6a --- /dev/null +++ b/test/test_kolmogorov_smirnov.cpp @@ -0,0 +1,102 @@ +// Copyright Evan Miller 2020 +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt +// or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#include +#include + +#define BOOST_TEST_MAIN +#include // for test_main +#include // for BOOST_CHECK_CLOSE +#include +#include + +template // Any floating-point type RealType. +void test_spots(RealType) +{ + using namespace boost::math; + // Test quantiles, CDFs, and complements + RealType eps = tools::epsilon(); + RealType tol = tools::epsilon() * 25; + for (int n=10; n<100; n += 10) { + kolmogorov_smirnov_distribution dist(n); + for (int i=0; i<1000; i++) { + RealType p = 1.0 * (i+1) / 1001; + RealType crit1 = quantile(dist, 1 - p); + RealType crit2 = quantile(complement(dist, p)); + RealType p1 = cdf(dist, crit1); + BOOST_CHECK_CLOSE_FRACTION(crit1, crit2, tol); + BOOST_CHECK_CLOSE_FRACTION(1 - p, p1, tol); + } + + for (int i=0; i<1000; i++) { + RealType x = 1.0 * (i+1) / 1001; + RealType p = cdf(dist, x); + RealType p1 = cdf(complement(dist, x)); + RealType x1; + if (p < 0.5) + x1 = quantile(dist, p); + else + x1 = quantile(complement(dist, p1)); + if (p > tol && p1 > tol) // skip the extreme tails + BOOST_CHECK_CLOSE_FRACTION(x, x1, tol); + } + } + + kolmogorov_smirnov_distribution dist(100); + + // Basics + BOOST_CHECK_THROW(pdf(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(cdf(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(quantile(dist, RealType(-1.0)), std::domain_error); + BOOST_CHECK_THROW(quantile(dist, RealType(2.0)), std::domain_error); + + // Confirm mode is at least a local minimum + RealType mode = boost::math::mode(dist); + + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode - sqrt(eps))); + BOOST_TEST_CHECK(pdf(dist, mode) >= pdf(dist, mode + sqrt(eps))); + + // Test the moments - each one integrates the entire distribution + quadrature::exp_sinh integrator; + + auto f_one = [&, dist](RealType t) { return pdf(dist, t); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_one, eps), RealType(1), tol); + + RealType mean = boost::math::mean(dist); + auto f_mean = [&, dist](RealType t) { return pdf(dist, t) * t; }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_mean, eps), mean, tol); + + RealType var = variance(dist); + auto f_var = [&, dist, mean](RealType t) { return pdf(dist, t) * (t - mean) * (t - mean); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_var, eps), var, tol); + + RealType skew = skewness(dist); + auto f_skew = [&, dist, mean, var](RealType t) { return pdf(dist, t) + * (t - mean) * (t - mean) * (t - mean) / var / sqrt(var); }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_skew, eps), skew, 10*tol); + + RealType kurt = kurtosis(dist); + auto f_kurt= [&, dist, mean, var](RealType t) { return pdf(dist, t) + * (t - mean) * (t - mean) * (t - mean) * (t - mean) / var / var; }; + BOOST_CHECK_CLOSE_FRACTION(integrator.integrate(f_kurt, eps), kurt, 5*tol); + + BOOST_CHECK_CLOSE_FRACTION(kurt, kurtosis_excess(dist) + 3, eps); +} + +BOOST_AUTO_TEST_CASE( test_main ) +{ + BOOST_MATH_CONTROL_FP; + + // (Parameter value, arbitrarily zero, only communicates the floating point type). + test_spots(0.0F); // Test float. + test_spots(0.0); // Test double. +#ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS + test_spots(0.0L); // Test long double. +#if !defined(BOOST_MATH_NO_REAL_CONCEPT_TESTS) + test_spots(boost::math::concepts::real_concept(0.)); // Test real concept. +#endif +#endif +}