From b6c2e71ba3fdccd0658e9d2fd8ff5edfbcec56b0 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 30 Jul 2015 18:52:52 +0200 Subject: [PATCH] Using merge with merge path algorithm in merge sort Merge with merge path is used in merge part of merge sort algorithm as it's seems to be more efficient than block merge when there are a few big sorted blocks left to be merged. --- .../algorithm/detail/merge_sort_on_cpu.hpp | 63 ++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp index 0af6eae9..a58104a5 100644 --- a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp +++ b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,39 @@ inline void merge_blocks(Iterator first, queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } +template +inline void dispatch_merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + const size_t input_size_threshold, + const size_t blocks_no_threshold, + command_queue &queue) +{ + const size_t blocks_no = static_cast( + std::ceil(float(count) / block_size) + ); + // merge with merge path should used only for the large arrays and at the + // end of merging part when there are only a few big blocks left to be merged + if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ + Iterator last = first + count; + for(size_t i = 0; i < count; i+= 2*block_size) + { + Iterator first1 = (std::min)(first + i, last); + Iterator last1 = (std::min)(first1 + block_size, last); + Iterator first2 = last1; + Iterator last2 = (std::min)(first2 + block_size, last); + Iterator block_result = (std::min)(result + i, result + count); + merge_with_merge_path(first1, last1, first2, last2, + block_result, compare, queue); + } + } + else { + merge_blocks(first, result, compare, count, block_size, queue); + } +} + template inline void block_insertion_sort(Iterator first, Compare compare, @@ -119,6 +153,7 @@ inline void block_insertion_sort(Iterator first, queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); } +// This sort is stable. template inline void merge_sort_on_cpu(Iterator first, Iterator last, @@ -146,25 +181,39 @@ inline void merge_sort_on_cpu(Iterator first, boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); + // When there is merge_with_path_blocks_no_threshold or less blocks left to + // merge AND input size is merge_with_merge_path_input_size_threshold or more + // merge_with_merge_path() algorithm is used to merge sorted blocks; + // otherwise merge_blocks() is used. + const size_t merge_with_path_blocks_no_threshold = + parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); + const size_t merge_with_path_input_size_threshold = + parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); + const size_t block_size = parameters->get(cache_key, "insertion_sort_block_size", 64); block_insertion_sort(first, compare, count, block_size, queue); // temporary buffer for merge result vector temp(count, context); - bool result_in_temp = false; + bool result_in_temporary_buffer = false; for(size_t i = block_size; i < count; i *= 2){ - result_in_temp = !result_in_temp; - if(result_in_temp) { - merge_blocks(first, temp.begin(), compare, count, i, queue); + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + dispatch_merge_blocks(first, temp.begin(), compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); } else { - merge_blocks(temp.begin(), first, compare, count, i, queue); + dispatch_merge_blocks(temp.begin(), first, compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); } } - // if the result is in temp buffer we need to copy it to input - if(result_in_temp) { + if(result_in_temporary_buffer) { copy(temp.begin(), temp.end(), first, queue); } }