diff --git a/doc/differentiation/lanczos_smoothing.qbk b/doc/differentiation/lanczos_smoothing.qbk
index 89e88eb67..e79da64cc 100644
--- a/doc/differentiation/lanczos_smoothing.qbk
+++ b/doc/differentiation/lanczos_smoothing.qbk
@@ -27,8 +27,6 @@ namespace boost::math::differentiation {
         template<class RandomAccessContainer>
         RandomAccessContainer operator()(RandomAccessContainer const & v) const;
 
-        void set_spacing(Real spacing);
-
         Real get_spacing() const;
     };
 
@@ -132,7 +130,11 @@ We can see that the smoothing derivative tracks the increase and decrease in the
 The computation of the filters is ill-conditioned for large /p/.
 In order to mitigate this problem, we have computed the filters in higher precision and cast the results to the desired type.
 However, this simply pushes the problem to larger /p/.
-In practice, this is not a problem, as large /p/ corresponds to less powerful denoising.
+In practice, this is not a problem, as large /p/ corresponds to less powerful denoising, but keep it in mind.
+
+In addition, the `-ffast-math` flag has a very large effect on the speed of these functions.
+In our benchmarks, they were 50% faster with the flag enabled, which is much larger than the usual performance increases we see by turning on this flag.
+Hence, if the default performance is not sufficient, this flag is available, though it comes with its own problems.
 
 [heading References]
 
diff --git a/include/boost/math/differentiation/lanczos_smoothing.hpp b/include/boost/math/differentiation/lanczos_smoothing.hpp
index 0d73aa76a..5c237deed 100644
--- a/include/boost/math/differentiation/lanczos_smoothing.hpp
+++ b/include/boost/math/differentiation/lanczos_smoothing.hpp
@@ -283,12 +283,16 @@ public:
                 m_f.resize(interior.size());
                 for (size_t j = 0; j < interior.size(); ++j)
                 {
-                    m_f[j] = static_cast<Real>(interior[j]);
+                    m_f[j] = static_cast<Real>(interior[j])/m_dt;
                 }
             }
             else
             {
                 m_f = detail::interior_velocity_filter<Real>(n, approximation_order);
+                for (auto & x : m_f)
+                {
+                    x /= m_dt;
+                }
             }
 
             m_boundary_filters.resize(n);
@@ -303,13 +307,17 @@ public:
                     m_boundary_filters[i].resize(bf.size());
                     for (size_t j = 0; j < bf.size(); ++j)
                     {
-                        m_boundary_filters[i][j] = static_cast<Real>(bf[j]);
+                        m_boundary_filters[i][j] = static_cast<Real>(bf[j])/m_dt;
                     }
                 }
                 else
                 {
                     int64_t s = static_cast<int64_t>(i) - static_cast<int64_t>(n);
                     m_boundary_filters[i] = detail::boundary_velocity_filter<Real>(n, approximation_order, s);
+                    for (auto & bf : m_boundary_filters[i])
+                    {
+                        bf /= m_dt;
+                    }
                 }
             }
         }
@@ -326,7 +334,7 @@ public:
                 m_f.resize(n+1);
                 for (size_t i = 0; i < m_f.size(); ++i)
                 {
-                    m_f[i] = static_cast<Real>(f[i+n]);
+                    m_f[i] = static_cast<Real>(f[i+n])/(m_dt*m_dt);
                 }
                 m_boundary_filters.resize(n);
                 for (size_t i = 0; i < n; ++i)
@@ -336,7 +344,7 @@ public:
                     m_boundary_filters[i].resize(bf.size());
                     for (size_t j = 0; j < bf.size(); ++j)
                     {
-                        m_boundary_filters[i][j] = static_cast<Real>(bf[j]);
+                        m_boundary_filters[i][j] = static_cast<Real>(bf[j])/(m_dt*m_dt);
                     }
                 }
             }
@@ -348,13 +356,17 @@ public:
                 m_f.resize(n+1);
                 for (size_t i = 0; i < m_f.size(); ++i)
                 {
-                    m_f[i] = f[i+n];
+                    m_f[i] = f[i+n]/(m_dt*m_dt);
                 }
                 m_boundary_filters.resize(n);
                 for (size_t i = 0; i < n; ++i)
                 {
                     int64_t s = static_cast<int64_t>(i) - static_cast<int64_t>(n);
                     m_boundary_filters[i] = detail::acceleration_filter<Real>(n, approximation_order, s);
+                    for (auto & bf : m_boundary_filters[i])
+                    {
+                        bf /= (m_dt*m_dt);
+                    }
                 }
             }
         }
@@ -364,12 +376,6 @@ public:
         }
     }
 
-    void set_spacing(Real const & spacing)
-    {
-        BOOST_ASSERT_MSG(spacing > 0, "Spacing between samples must be > 0.");
-        m_dt = spacing;
-    }
-
     Real get_spacing() const
     {
         return m_dt;
@@ -388,72 +394,73 @@ public:
         {
             if (i >= m_f.size() - 1 && i <= size(v) - m_f.size())
             {
-                Real dv = 0;
-                for (size_t j = 1; j < m_f.size(); ++j)
+                // The filter has length >= 1:
+                Real dvdt = m_f[1] * (v[i + 1] - v[i - 1]);
+                for (size_t j = 2; j < m_f.size(); ++j)
                 {
-                    dv += m_f[j] * (v[i + j] - v[i - j]);
+                    dvdt += m_f[j] * (v[i + j] - v[i - j]);
                 }
-                return dv / m_dt;
+                return dvdt;
             }
 
             // m_f.size() = N+1
             if (i < m_f.size() - 1)
             {
                 auto &bf = m_boundary_filters[i];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0]*v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    dv += bf[j] * v[j];
+                    dvdt += bf[j] * v[j];
                 }
-                return dv / m_dt;
+                return dvdt;
             }
 
             if (i > size(v) - m_f.size() && i < size(v))
             {
                 int k = size(v) - 1 - i;
                 auto &bf = m_boundary_filters[k];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0]*v[size(v)-1];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    dv += bf[j] * v[size(v) - 1 - j];
+                    dvdt += bf[j] * v[size(v) - 1 - j];
                 }
-                return -dv / m_dt;
+                return -dvdt;
             }
         }
         else if constexpr (order==2)
         {
             if (i >= m_f.size() - 1 && i <= size(v) - m_f.size())
             {
-                Real d2v = m_f[0]*v[i];
+                Real d2vdt2 = m_f[0]*v[i];
                 for (size_t j = 1; j < m_f.size(); ++j)
                 {
-                    d2v += m_f[j] * (v[i + j] + v[i - j]);
+                    d2vdt2 += m_f[j] * (v[i + j] + v[i - j]);
                 }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
             }
 
             // m_f.size() = N+1
             if (i < m_f.size() - 1)
             {
                 auto &bf = m_boundary_filters[i];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0]*v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    d2v += bf[j] * v[j];
+                    d2vdt2 += bf[j] * v[j];
                 }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
             }
 
             if (i > size(v) - m_f.size() && i < size(v))
             {
                 int k = size(v) - 1 - i;
                 auto &bf = m_boundary_filters[k];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0] * v[size(v) - 1];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    d2v += bf[j] * v[size(v) - 1 - j];
+                    d2vdt2 += bf[j] * v[size(v) - 1 - j];
                 }
-                return d2v / (m_dt*m_dt);
+                return d2vdt2;
             }
         }
 
@@ -477,22 +484,22 @@ public:
             for (size_t i = 0; i < m_f.size() - 1; ++i)
             {
                 auto &bf = m_boundary_filters[i];
-                Real dv = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real dvdt = bf[0] * v[0];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    dv += bf[j] * v[j];
+                    dvdt += bf[j] * v[j];
                 }
-                w[i] = dv / m_dt;
+                w[i] = dvdt;
             }
 
             for(size_t i = m_f.size() - 1; i <= size(v) - m_f.size(); ++i)
             {
-                Real dv = 0;
-                for (size_t j = 1; j < m_f.size(); ++j)
+                Real dvdt = m_f[1] * (v[i + 1] - v[i - 1]);
+                for (size_t j = 2; j < m_f.size(); ++j)
                 {
-                    dv += m_f[j] * (v[i + j] - v[i - j]);
+                    dvdt += m_f[j] *(v[i + j] - v[i - j]);
                 }
-                w[i] = dv / m_dt;
+                w[i] = dvdt;
             }
 
 
@@ -500,12 +507,12 @@ public:
             {
                 int k = size(v) - 1 - i;
                 auto &f = m_boundary_filters[k];
-                Real dv = 0;
-                for (size_t j = 0; j < f.size(); ++j)
+                Real dvdt = f[0] * v[size(v) - 1];;
+                for (size_t j = 1; j < f.size(); ++j)
                 {
-                    dv += f[j] * v[size(v) - 1 - j];
+                    dvdt += f[j] * v[size(v) - 1 - j];
                 }
-                w[i] = -dv / m_dt;
+                w[i] = -dvdt;
             }
         }
         else if constexpr (order==2)
@@ -514,35 +521,34 @@ public:
             for (size_t i = 0; i < m_f.size() - 1; ++i)
             {
                 auto &bf = m_boundary_filters[i];
-                Real d2v = 0;
+                Real d2vdt2 = 0;
                 for (size_t j = 0; j < bf.size(); ++j)
                 {
-                    d2v += bf[j] * v[j];
+                    d2vdt2 += bf[j] * v[j];
                 }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
             }
 
             for (size_t i = m_f.size() - 1; i <= size(v) - m_f.size(); ++i)
             {
-                Real d2v = m_f[0]*v[i];
+                Real d2vdt2 = m_f[0]*v[i];
                 for (size_t j = 1; j < m_f.size(); ++j)
                 {
-                    d2v += m_f[j] * (v[i + j] + v[i - j]);
+                    d2vdt2 += m_f[j] * (v[i + j] + v[i - j]);
                 }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
             }
 
-
             for (size_t i = size(v) - m_f.size() + 1; i < size(v); ++i)
             {
                 int k = size(v) - 1 - i;
                 auto &bf = m_boundary_filters[k];
-                Real d2v = 0;
-                for (size_t j = 0; j < bf.size(); ++j)
+                Real d2vdt2 = bf[0] * v[size(v) - 1];
+                for (size_t j = 1; j < bf.size(); ++j)
                 {
-                    d2v += bf[j] * v[size(v) - 1 - j];
+                    d2vdt2 += bf[j] * v[size(v) - 1 - j];
                 }
-                w[i] = d2v / (m_dt*m_dt);
+                w[i] = d2vdt2;
             }
         }
 
diff --git a/test/lanczos_smoothing_test.cpp b/test/lanczos_smoothing_test.cpp
index 1303933fd..05716fec7 100644
--- a/test/lanczos_smoothing_test.cpp
+++ b/test/lanczos_smoothing_test.cpp
@@ -598,10 +598,13 @@ BOOST_AUTO_TEST_CASE(lanczos_smoothing_test)
     test_dlp_evaluation<double>();
     test_dlp_derivatives<double>();
     test_dlp_next<double>();
-    test_dlp_norms<cpp_bin_float_50>();
+
+    // Takes too long!
+    //test_dlp_norms<cpp_bin_float_50>();
     test_boundary_velocity_filters<double>();
     test_boundary_velocity_filters<long double>();
-    test_boundary_velocity_filters<cpp_bin_float_50>();
+    // Takes too long!
+    //test_boundary_velocity_filters<cpp_bin_float_50>();
     test_boundary_lanczos<double>();
     test_boundary_lanczos<long double>();
     // Takes too long!