diff --git a/include/boost/math/special_functions/prime_sieve_jm.hpp b/include/boost/math/special_functions/prime_sieve_jm.hpp index 1a2b005b7..a83e5f168 100644 --- a/include/boost/math/special_functions/prime_sieve_jm.hpp +++ b/include/boost/math/special_functions/prime_sieve_jm.hpp @@ -163,10 +163,93 @@ inline bool linear_sieve_classical_segment(Container& primes, Sieve& masks, Inte return true; } + // 4096 is where benchmarked performance of linear_sieve begins to diverge template const Integer linear_sieve_limit = Integer(524288); // Constexpr does not work with boost::multiprecision types +template +inline bool linear_sieve_classical_segment_threaded(std::atomic* current_max_processed_value, std::mutex* lock, Container* primes, Integer start_offset, Integer end_offset, Integer stride, OutputIterator out, bool output_to_container) +{ + simple_bitset masks(linear_sieve_limit / 2); + + std::unique_lock l(*lock); + std::size_t prime_count = primes->size(); + l.unlock(); + + while (start_offset < end_offset) + { + Integer max_points = end_offset - start_offset > linear_sieve_limit ? linear_sieve_limit : end_offset - start_offset; + Integer limit = static_cast(std::sqrt(static_cast(start_offset + max_points)) + 1); + // Begin by striking out odd numbered multiples of all the primes we have so far. + // 1-based index, we only have odd numbers in the sieve so don't need to handle 2: + for (std::size_t i = 1; i < prime_count; ++i) + { + Integer prime = (*primes)[i]; + if (prime > limit) + break; + Integer index = prime - start_offset % prime; + if ((index & 1) == 0) + index += prime; + index >>= 1; + for (; index < max_points / 2; index += prime) + masks.clear(index); + } + // + // Now we must wait until the previous thread has completed the segment before this one: + // + while (current_max_processed_value->load() != start_offset) + std::this_thread::yield(); + // + // Maybe process all the primes we didn't have access to in the loop above: + // + if ((*primes)[prime_count - 1] < limit) + { + l.lock(); + prime_count = primes->size(); + l.unlock(); + // Begin by striking out odd numbered multiples of all the primes we have so far. + // 1-based index, we only have odd numbers in the sieve so don't need to handle 2: + for (std::size_t i = 1; i < prime_count; ++i) + { + Integer prime = (*primes)[i]; + if (prime > limit) + break; + Integer index = prime - start_offset % prime; + if ((index & 1) == 0) + index += prime; + index >>= 1; + for (; index < max_points / 2; index += prime) + masks.clear(index); + } + } + // + // Now walk through the sieve outputting primes. + // + l.lock(); + for (Integer index = 0; index < max_points / 2; ++index) + { + if (masks.test(index)) + { + *out++ = start_offset + 2 * index + 1; + if (output_to_container) + primes->push_back(start_offset + 2 * index + 1); + if (has_output_iterator_terminated(out)) + return false; + } + } + prime_count = primes->size(); + l.unlock(); + // + // We're done on this segment, signal the next thread: + // + masks.reset(); + *current_max_processed_value = start_offset + max_points; + start_offset += stride; + } + return true; +} + template void prime_sieve_imp(ExecutionPolicy&& policy, Integer upper_bound, Container& primes, OutputIterator out, bool output_to_container) { @@ -203,25 +286,34 @@ void prime_sieve_imp(ExecutionPolicy&& policy, Integer upper_bound, Container& p return; } } -#if 0 - else { - std::vector small_primes{}; - small_primes.reserve(1028); + unsigned hardware_concurrency = std::thread::hardware_concurrency(); + if ((hardware_concurrency == 0) || (upper_bound <= linear_sieve_limit * 2)) + { + // + // No point in using threads as there's only one more segment to process: + // + linear_sieve_classical_segment(primes, sieve, linear_sieve_limit, upper_bound - linear_sieve_limit, out, output_to_container); + } + else + { + unsigned n_threads = (upper_bound - linear_sieve_limit) / linear_sieve_limit +((upper_bound - linear_sieve_limit) % linear_sieve_limit ? 1 : 0); + n_threads = (std::min)(n_threads, hardware_concurrency / 2); - std::thread t1([&small_primes] { - boost::math::detail::linear_sieve(static_cast(linear_sieve_limit * 2), small_primes); - }); - std::thread t2([upper_bound, &primes] { - boost::math::detail::segmented_sieve(static_cast(linear_sieve_limit * 2), upper_bound, primes); - }); + std::atomic current_max_processed_value = linear_sieve_limit; + std::vector> threads(n_threads); + std::mutex mutex; - t1.join(); - t2.join(); - primes.insert(primes.begin(), small_primes.begin(), small_primes.end()); + for (unsigned i = 0; i < n_threads; ++i) + threads[i].reset(new std::thread(linear_sieve_classical_segment_threaded, + ¤t_max_processed_value, &mutex, &primes, linear_sieve_limit * (i + 1), upper_bound, + linear_sieve_limit * n_threads, out, output_to_container)); + + for (unsigned i = 0; i < n_threads; ++i) + threads[i]->join(); + } } -#endif } template diff --git a/reporting/performance/prime_sieve_performance_jm.cpp b/reporting/performance/prime_sieve_performance_jm.cpp index a0d1697b7..6fcee3b6c 100644 --- a/reporting/performance/prime_sieve_performance_jm.cpp +++ b/reporting/performance/prime_sieve_performance_jm.cpp @@ -66,6 +66,19 @@ void prime_sieve_seq(benchmark::State& state) state.SetComplexityN(state.range(0)); } template +void prime_sieve_par(benchmark::State& state) +{ + Integer upper = static_cast(state.range(0)); + std::vector primes; + boost::math::prime_reserve(upper, primes); + for(auto _ : state) + { + primes.clear(); + boost::math::prime_sieve(std::execution::par, upper, primes); + } + state.SetComplexityN(state.range(0)); +} +template void prime_sieve_seq_jm(benchmark::State& state) { Integer upper = static_cast(state.range(0)); @@ -79,6 +92,19 @@ void prime_sieve_seq_jm(benchmark::State& state) state.SetComplexityN(state.range(0)); } template +void prime_sieve_seq_jm_par(benchmark::State& state) +{ + Integer upper = static_cast(state.range(0)); + std::vector primes; + boost::math::prime_reserve(upper, primes); + for(auto _ : state) + { + primes.clear(); + jm::prime_sieve(std::execution::par, upper, primes); + } + state.SetComplexityN(state.range(0)); +} +template void prime_sieve_seq_jm_oi(benchmark::State& state) { Integer upper = static_cast(state.range(0)); @@ -160,11 +186,13 @@ constexpr uint64_t high_range = uint64_t(1) << 32; // Complete Implemenations //BENCHMARK_TEMPLATE(prime_sieve, int32_t)->RangeMultiplier(2)->Range(1 << 1, 1 << 30)->Complexity(benchmark::oN)->UseRealTime(); //BENCHMARK_TEMPLATE(prime_sieve, int64_t)->RangeMultiplier(2)->Range(1 << 1, 1 << 30)->Complexity(benchmark::oN)->UseRealTime(); -BENCHMARK_TEMPLATE(kimwalish_primes, int64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); -BENCHMARK_TEMPLATE(prime_sieve_seq, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); -BENCHMARK_TEMPLATE(prime_sieve_seq_oi, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); -BENCHMARK_TEMPLATE(prime_sieve_seq_jm, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); -BENCHMARK_TEMPLATE(prime_sieve_seq_jm_oi, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); +BENCHMARK_TEMPLATE(kimwalish_primes, int64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN)->UseRealTime(); +BENCHMARK_TEMPLATE(prime_sieve_seq, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN)->UseRealTime(); +BENCHMARK_TEMPLATE(prime_sieve_par, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN)->UseRealTime(); +//BENCHMARK_TEMPLATE(prime_sieve_seq_oi, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); +BENCHMARK_TEMPLATE(prime_sieve_seq_jm, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN)->UseRealTime(); +BENCHMARK_TEMPLATE(prime_sieve_seq_jm_par, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN)->UseRealTime(); +//BENCHMARK_TEMPLATE(prime_sieve_seq_jm_oi, uint64_t)->RangeMultiplier(2)->Range(low_range, high_range)->Complexity(benchmark::oN); //BENCHMARK_TEMPLATE(prime_sieve, boost::multiprecision::cpp_int)->RangeMultiplier(2)->Range(1 << 1, 1 << 30)->Complexity(benchmark::oN)->UseRealTime(); //BENCHMARK_TEMPLATE(prime_sieve, boost::multiprecision::mpz_int)->RangeMultiplier(2)->Range(1 << 1, 1 << 30)->Complexity(benchmark::oN)->UseRealTime();