Save an xorpd instruction by initializing not to zero, but to the first value. Save a division for each element by dividing the filters by the spacing (square of the spacing for the second derivative). [CI SKIP]

2026-01-19 04:22:09 +00:00 · 2019-01-26 21:34:16 -07:00
parent 0ca8faf3cc
commit a1cade5a90
3 changed files with 73 additions and 62 deletions
--- a/doc/differentiation/lanczos_smoothing.qbk
+++ b/doc/differentiation/lanczos_smoothing.qbk
@@ -27,8 +27,6 @@ namespace boost::math::differentiation {
        template<class RandomAccessContainer>
        RandomAccessContainer operator()(RandomAccessContainer const & v) const;

-        void set_spacing(Real spacing);
-
        Real get_spacing() const;
    };

@@ -132,7 +130,11 @@ We can see that the smoothing derivative tracks the increase and decrease in the
 The computation of the filters is ill-conditioned for large /p/.
 In order to mitigate this problem, we have computed the filters in higher precision and cast the results to the desired type.
 However, this simply pushes the problem to larger /p/.
-In practice, this is not a problem, as large /p/ corresponds to less powerful denoising.
+In practice, this is not a problem, as large /p/ corresponds to less powerful denoising, but keep it in mind.
+
+In addition, the `-ffast-math` flag has a very large effect on the speed of these functions.
+In our benchmarks, they were 50% faster with the flag enabled, which is much larger than the usual performance increases we see by turning on this flag.
+Hence, if the default performance is not sufficient, this flag is available, though it comes with its own problems.

 [heading References]

--- a/include/boost/math/differentiation/lanczos_smoothing.hpp
+++ b/include/boost/math/differentiation/lanczos_smoothing.hpp
@@ -283,12 +283,16 @@ public:
                m_f.resize(interior.size());
                for (size_t j = 0; j < interior.size(); ++j)
                {
-                    m_f[j] = static_cast<Real>(interior[j]);
+                    m_f[j] = static_cast<Real>(interior[j])/m_dt;
                }
            }
            else
            {
                m_f = detail::interior_velocity_filter<Real>(n, approximation_order);
+                for (auto & x : m_f)
+                {
+                    x /= m_dt;
+                }
            }

            m_boundary_filters.resize(n);
@@ -303,13 +307,17 @@ public:
                    m_boundary_filters[i].resize(bf.size());
                    for (size_t j = 0; j < bf.size(); ++j)
                    {
-                        m_boundary_filters[i][j] = static_cast<Real>(bf[j]);
+                        m_boundary_filters[i][j] = static_cast<Real>(bf[j])/m_dt;
                    }
                }
                else
                {
                    int64_t s = static_cast<int64_t>(i) - static_cast<int64_t>(n);
                    m_boundary_filters[i] = detail::boundary_velocity_filter<Real>(n, approximation_order, s);
+                    for (auto & bf : m_boundary_filters[i])
+                    {
+                        bf /= m_dt;
+                    }
                }
            }
        }
@@ -326,7 +334,7 @@ public:
                m_f.resize(n+1);
                for (size_t i = 0; i < m_f.size(); ++i)
                {
-                    m_f[i] = static_cast<Real>(f[i+n]);
+                    m_f[i] = static_cast<Real>(f[i+n])/(m_dt*m_dt);
                }
                m_boundary_filters.resize(n);
                for (size_t i = 0; i < n; ++i)
@@ -336,7 +344,7 @@ public:
                    m_boundary_filters[i].resize(bf.size());
                    for (size_t j = 0; j < bf.size(); ++j)
                    {
-                        m_boundary_filters[i][j] = static_cast<Real>(bf[j]);
+                        m_boundary_filters[i][j] = static_cast<Real>(bf[j])/(m_dt*m_dt);
                    }
                }
            }
@@ -348,13 +356,17 @@ public:
                m_f.resize(n+1);
                for (size_t i = 0; i < m_f.size(); ++i)
                {
-                    m_f[i] = f[i+n];
+                    m_f[i] = f[i+n]/(m_dt*m_dt);
                }
                m_boundary_filters.resize(n);
                for (size_t i = 0; i < n; ++i)
                {
                    int64_t s = static_cast<int64_t>(i) - static_cast<int64_t>(n);
                    m_boundary_filters[i] = detail::acceleration_filter<Real>(n, approximation_order, s);
+                    for (auto & bf : m_boundary_filters[i])
+                    {
+                        bf /= (m_dt*m_dt);
+                    }
                }
            }
        }
@@ -364,12 +376,6 @@ public:
        }
    }

-    void set_spacing(Real const & spacing)
-    {
-        BOOST_ASSERT_MSG(spacing > 0, "Spacing between samples must be > 0.");
-        m_dt = spacing;
-    }
-
    Real get_spacing() const
    {
        return m_dt;
@@ -388,72 +394,73 @@ public:
        {
            if (i >= m_f.size() - 1 && i <= size(v) - m_f.size())
            {
-                Real dv = 0;
-                for (size_t j = 1; j < m_f.size(); ++j)
+                // The filter has length >= 1:
+                Real dvdt = m_f[1] * (v[i + 1] - v[i - 1]);
+                for (size_t j = 2; j < m_f.size(); ++j)
                {
-                    dv += m_f[j] * (v[i + j] - v[i - j]);
+                    dvdt += m_f[j] * (v[i + j] - v[i - j]);
                }
-                return dv / m_dt;
+                return dvdt;
            }

            // m_f.size() = N+1
            if (i < m_f.size() - 1)
            {
                auto &bf = m_boundary_filters[i];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0]*v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    dv += bf[j] * v[j];
+                    dvdt += bf[j] * v[j];
                }
-                return dv / m_dt;
+                return dvdt;
            }

            if (i > size(v) - m_f.size() && i < size(v))
            {
                int k = size(v) - 1 - i;
                auto &bf = m_boundary_filters[k];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0]*v[size(v)-1];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    dv += bf[j] * v[size(v) - 1 - j];
+                    dvdt += bf[j] * v[size(v) - 1 - j];
                }
-                return -dv / m_dt;
+                return -dvdt;
            }
        }
        else if constexpr (order==2)
        {
            if (i >= m_f.size() - 1 && i <= size(v) - m_f.size())
            {
-                Real d2v = m_f[0]*v[i];
+                Real d2vdt2 = m_f[0]*v[i];
                for (size_t j = 1; j < m_f.size(); ++j)
                {
-                    d2v += m_f[j] * (v[i + j] + v[i - j]);
+                    d2vdt2 += m_f[j] * (v[i + j] + v[i - j]);
                }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
            }

            // m_f.size() = N+1
            if (i < m_f.size() - 1)
            {
                auto &bf = m_boundary_filters[i];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0]*v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    d2v += bf[j] * v[j];
+                    d2vdt2 += bf[j] * v[j];
                }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
            }

            if (i > size(v) - m_f.size() && i < size(v))
            {
                int k = size(v) - 1 - i;
                auto &bf = m_boundary_filters[k];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0] * v[size(v) - 1];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    d2v += bf[j] * v[size(v) - 1 - j];
+                    d2vdt2 += bf[j] * v[size(v) - 1 - j];
                }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
            }
        }

@@ -477,22 +484,22 @@ public:
            for (size_t i = 0; i < m_f.size() - 1; ++i)
            {
                auto &bf = m_boundary_filters[i];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0] * v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    dv += bf[j] * v[j];
+                    dvdt += bf[j] * v[j];
                }
-                w[i] = dv / m_dt;
+                w[i] = dvdt;
            }

            for(size_t i = m_f.size() - 1; i <= size(v) - m_f.size(); ++i)
            {
-                Real dv = 0;
-                for (size_t j = 1; j < m_f.size(); ++j)
+                Real dvdt = m_f[1] * (v[i + 1] - v[i - 1]);
+                for (size_t j = 2; j < m_f.size(); ++j)
                {
-                    dv += m_f[j] * (v[i + j] - v[i - j]);
+                    dvdt += m_f[j] *(v[i + j] - v[i - j]);
                }
-                w[i] = dv / m_dt;
+                w[i] = dvdt;
            }


@@ -500,12 +507,12 @@ public:
            {
                int k = size(v) - 1 - i;
                auto &f = m_boundary_filters[k];
-                Real dv = 0;
-                for (size_t j = 0; j < f.size(); ++j)
+                Real dvdt = f[0] * v[size(v) - 1];;
+                for (size_t j = 1; j < f.size(); ++j)
                {
-                    dv += f[j] * v[size(v) - 1 - j];
+                    dvdt += f[j] * v[size(v) - 1 - j];
                }
-                w[i] = -dv / m_dt;
+                w[i] = -dvdt;
            }
        }
        else if constexpr (order==2)
@@ -514,35 +521,34 @@ public:
            for (size_t i = 0; i < m_f.size() - 1; ++i)
            {
                auto &bf = m_boundary_filters[i];
-                Real d2v = 0;
+                Real d2vdt2 = 0;
                for (size_t j = 0; j < bf.size(); ++j)
                {
-                    d2v += bf[j] * v[j];
+                    d2vdt2 += bf[j] * v[j];
                }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
            }

            for (size_t i = m_f.size() - 1; i <= size(v) - m_f.size(); ++i)
            {
-                Real d2v = m_f[0]*v[i];
+                Real d2vdt2 = m_f[0]*v[i];
                for (size_t j = 1; j < m_f.size(); ++j)
                {
-                    d2v += m_f[j] * (v[i + j] + v[i - j]);
+                    d2vdt2 += m_f[j] * (v[i + j] + v[i - j]);
                }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
            }

-
            for (size_t i = size(v) - m_f.size() + 1; i < size(v); ++i)
            {
                int k = size(v) - 1 - i;
                auto &bf = m_boundary_filters[k];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0] * v[size(v) - 1];
+                for (size_t j = 1; j < bf.size(); ++j)
                {
-                    d2v += bf[j] * v[size(v) - 1 - j];
+                    d2vdt2 += bf[j] * v[size(v) - 1 - j];
                }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
            }
        }

--- a/test/lanczos_smoothing_test.cpp
+++ b/test/lanczos_smoothing_test.cpp
@@ -598,10 +598,13 @@ BOOST_AUTO_TEST_CASE(lanczos_smoothing_test)
    test_dlp_evaluation<double>();
    test_dlp_derivatives<double>();
    test_dlp_next<double>();
-    test_dlp_norms<cpp_bin_float_50>();
+
+    // Takes too long!
+    //test_dlp_norms<cpp_bin_float_50>();
    test_boundary_velocity_filters<double>();
    test_boundary_velocity_filters<long double>();
-    test_boundary_velocity_filters<cpp_bin_float_50>();
+    // Takes too long!
+    //test_boundary_velocity_filters<cpp_bin_float_50>();
    test_boundary_lanczos<double>();
    test_boundary_lanczos<long double>();
    // Takes too long!