From e357a6a338c08da0ef2d25470120242b75acf351 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Tue, 18 Aug 2015 21:26:34 +0200 Subject: [PATCH 1/3] Test if min/max_element algorithms return the first min/max in given range --- perf/perf_max_element.cpp | 28 ++++++++++++++++++++++------ test/test_extrema.cpp | 16 ++++++++++------ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/perf/perf_max_element.cpp b/perf/perf_max_element.cpp index 66354885..0e47c67e 100644 --- a/perf/perf_max_element.cpp +++ b/perf/perf_max_element.cpp @@ -47,28 +47,44 @@ int main(int argc, char *argv[]) queue ); - boost::compute::vector::iterator max = device_vector.begin(); + boost::compute::vector::iterator device_max_iter + = device_vector.begin(); + perf_timer t; for(size_t trial = 0; trial < PERF_TRIALS; trial++){ t.start(); - max = boost::compute::max_element( + device_max_iter = boost::compute::max_element( device_vector.begin(), device_vector.end(), queue ); queue.finish(); t.stop(); } - int device_max = max.read(queue); + int device_max = device_max_iter.read(queue); std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; std::cout << "max: " << device_max << std::endl; // verify max is correct - int host_max = *std::max_element(host_vector.begin(), host_vector.end()); + std::vector::iterator host_max_iter + = std::max_element(host_vector.begin(), host_vector.end()); + + int host_max = *host_max_iter; if(device_max != host_max){ + std::cout << "ERROR: " + << "device_max (" << device_max << ") " + << "!= " + << "host_max (" << host_max << ")" + << std::endl; + return -1; + } + + size_t host_max_idx = std::distance(host_vector.begin(), host_max_iter); + size_t device_max_idx = std::distance(device_vector.begin(), device_max_iter); + if(device_max_idx != host_max_idx){ std::cout << "ERROR: " - << "device_max (" << device_max << ") " + << "device_max index (" << device_max_idx << ") " << "!= " - << "host_max (" << host_max << ")" + << "host_max index (" << host_max_idx << ")" << std::endl; return -1; } diff --git a/test/test_extrema.cpp b/test/test_extrema.cpp index 98804bce..dd5fa474 100644 --- a/test/test_extrema.cpp +++ b/test/test_extrema.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -25,18 +26,21 @@ BOOST_AUTO_TEST_CASE(int_min_max) { - int data[] = { 9, 15, 1, 4 }; - boost::compute::vector vector(data, data + 4, queue); + boost::compute::vector vector(size_t(4096), int(0), queue); + boost::compute::iota(vector.begin(), (vector.begin() + 512), 1, queue); + boost::compute::fill((vector.end() - 512), vector.end(), 513, queue); boost::compute::vector::iterator min_iter = boost::compute::min_element(vector.begin(), vector.end(), queue); - BOOST_CHECK(min_iter == vector.begin() + 2); - BOOST_CHECK_EQUAL(*min_iter, 1); + BOOST_CHECK(min_iter == vector.begin() + 512); + BOOST_CHECK_EQUAL((vector.begin() + 512).read(queue), 0); + BOOST_CHECK_EQUAL(min_iter.read(queue), 0); boost::compute::vector::iterator max_iter = boost::compute::max_element(vector.begin(), vector.end(), queue); - BOOST_CHECK(max_iter == vector.begin() + 1); - BOOST_CHECK_EQUAL(*max_iter, 15); + BOOST_CHECK(max_iter == vector.end() - 512); + BOOST_CHECK_EQUAL((vector.end() - 512).read(queue), 513); + BOOST_CHECK_EQUAL(max_iter.read(queue), 513); } BOOST_AUTO_TEST_CASE(int2_min_max_custom_comparision_function) From 2d972fe2a4b7318e40db9451ea93cfa15808476b Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Thu, 24 Sep 2015 20:29:25 +0200 Subject: [PATCH 2/3] Fix find_extrema_with_reduce Now find_extrema_with_reduce always return the first extremum in the given range. --- .../detail/find_extrema_with_reduce.hpp | 373 +++++++++++------- 1 file changed, 238 insertions(+), 135 deletions(-) diff --git a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp index 55fb688b..a157e248 100644 --- a/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp +++ b/include/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp @@ -12,11 +12,11 @@ #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP #include -#include #include #include #include +#include #include #include #include @@ -24,7 +24,6 @@ #include #include #include -#include namespace boost { namespace compute { @@ -73,16 +72,24 @@ bool find_extrema_with_reduce_requirements_met(InputIterator first, return ((required_local_mem_size * 4) <= local_mem_size); } +/// \internal_ +/// Algorithm finds the first extremum in given range, i.e., with the lowest +/// index. +/// +/// If \p use_input_idx is false, it's assumed that input data is ordered by +/// increasing index and \p input_idx is not used in the algorithm. template -inline size_t find_extrema_with_reduce(InputIterator first, - size_t count, - ResultIterator result, - vector::iterator result_idx, - size_t work_groups_no, - size_t work_group_size, - Compare compare, - const bool find_minimum, - command_queue &queue) +inline void find_extrema_with_reduce(InputIterator input, + vector::iterator input_idx, + size_t count, + ResultIterator result, + vector::iterator result_idx, + size_t work_groups_no, + size_t work_group_size, + Compare compare, + const bool find_minimum, + const bool use_input_idx, + command_queue &queue) { typedef typename std::iterator_traits::value_type input_type; @@ -90,44 +97,68 @@ inline size_t find_extrema_with_reduce(InputIterator first, meta_kernel k("find_extrema_reduce"); size_t count_arg = k.add_arg("count"); - size_t output_arg = k.add_arg(memory_object::global_memory, "output"); - size_t output_idx_arg = k.add_arg(memory_object::global_memory, "output_idx"); size_t block_arg = k.add_arg(memory_object::local_memory, "block"); size_t block_idx_arg = k.add_arg(memory_object::local_memory, "block_idx"); k << // Work item global id k.decl("gid") << " = get_global_id(0);\n" << - // - "if(gid >= count) {\n return;\n }\n" << + "if(gid >= count) {\n return;\n}\n" << // Index of element that will be read from input buffer k.decl("idx") << " = gid;\n" << k.decl("acc") << ";\n" << - // Index of currently best element + // Real index of currently best element + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + k.decl("acc_idx") << " = " << input_idx[k.var("idx")] << ";\n" << + "#else\n" << k.decl("acc_idx") << " = idx;\n" << + "#endif\n" << // Init accumulator with first[get_global_id(0)] - "acc = " << first[k.var("idx")] << ";\n" << + "acc = " << input[k.var("idx")] << ";\n" << "idx += get_global_size(0);\n" << k.decl("compare_result") << ";\n" << + k.decl("equal") << ";\n\n" << "while( idx < count ){\n" << // Next element - k.decl("next") << " = " << first[k.var("idx")] << ";\n" << + k.decl("next") << " = " << input[k.var("idx")] << ";\n" << + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + k.decl("next_idx") << " = " << input_idx[k.var("idx")] << ";\n" << + "#endif\n" << + // Comparison between currently best element (acc) and next element - "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << - "compare_result = " << compare(k.var("acc"), - k.var("next")) << ";\n" << - "#else\n" << + "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << "compare_result = " << compare(k.var("next"), k.var("acc")) << ";\n" << + "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "equal = !compare_result && !" << + compare(k.var("acc"), + k.var("next")) << ";\n" << + "# endif\n" << + "#else\n" << + "compare_result = " << compare(k.var("acc"), + k.var("next")) << ";\n" << + "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "equal = !compare_result && !" << + compare(k.var("next"), + k.var("acc")) << ";\n" << + "# endif\n" << "#endif\n" << + + // save the winner "acc = compare_result ? acc : next;\n" << + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "acc_idx = compare_result ? " << + "acc_idx : " << + "(equal ? min(acc_idx, next_idx) : next_idx);\n" << + "#else\n" << "acc_idx = compare_result ? acc_idx : idx;\n" << + "#endif\n" << "idx += get_global_size(0);\n" << - "}\n" << + "}\n\n" << // Work item local id k.decl("lid") << " = get_local_id(0);\n" << @@ -135,7 +166,8 @@ inline size_t find_extrema_with_reduce(InputIterator first, "block_idx[lid] = acc_idx;\n" << "barrier(CLK_LOCAL_MEM_FENCE);\n" << - k.decl("group_offset") << " = count - (get_local_size(0) * get_group_id(0));\n"; + k.decl("group_offset") << + " = count - (get_local_size(0) * get_group_id(0));\n\n"; k << "#pragma unroll\n" @@ -144,35 +176,46 @@ inline size_t find_extrema_with_reduce(InputIterator first, "if((lid < offset) && ((lid + offset) < group_offset)) { \n" << k.decl("mine") << " = block[lid];\n" << k.decl("other") << " = block[lid+offset];\n" << - "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << - "compare_result = " << compare(k.var("mine"), - k.var("other")) << ";\n" << - "#else\n" << + "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << "compare_result = " << compare(k.var("other"), k.var("mine")) << ";\n" << + "equal = !compare_result && !" << + compare(k.var("mine"), + k.var("other")) << ";\n" << + "#else\n" << + "compare_result = " << compare(k.var("mine"), + k.var("other")) << ";\n" << + "equal = !compare_result && !" << + compare(k.var("other"), + k.var("mine")) << ";\n" << "#endif\n" << "block[lid] = compare_result ? mine : other;\n" << + k.decl("mine_idx") << " = block_idx[lid];\n" << + k.decl("other_idx") << " = block_idx[lid+offset];\n" << "block_idx[lid] = compare_result ? " << - "block_idx[lid] : block_idx[lid+offset];\n" << + "mine_idx : " << + "(equal ? min(mine_idx, other_idx) : other_idx);\n" << "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" << - "}\n" << + "}\n\n" << // write block result to global output "if(lid == 0){\n" << - " output[get_group_id(0)] = block[0];\n" << - " output_idx[get_group_id(0)] = block_idx[0];\n" << + result[k.var("get_group_id(0)")] << " = block[0];\n" << + result_idx[k.var("get_group_id(0)")] << " = block_idx[0];\n" << "}"; std::string options; if(!find_minimum){ options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; } + if(use_input_idx){ + options += " -DBOOST_COMPUTE_USE_INPUT_IDX"; + } + kernel kernel = k.compile(context, options); kernel.set_arg(count_arg, static_cast(count)); - kernel.set_arg(output_arg, result.get_buffer()); - kernel.set_arg(output_idx_arg, result_idx.get_buffer()); kernel.set_arg(block_arg, local_buffer(work_group_size)); kernel.set_arg(block_idx_arg, local_buffer(work_group_size)); @@ -180,98 +223,25 @@ inline size_t find_extrema_with_reduce(InputIterator first, 0, work_groups_no * work_group_size, work_group_size); - - return 0; } -template -uint_ find_extrema_final(InputIterator candidates, - vector::iterator candidates_idx, - const size_t count, - Compare compare, - const bool find_minimum, - const size_t work_group_size, - command_queue &queue) +template +inline void find_extrema_with_reduce(InputIterator input, + size_t count, + ResultIterator result, + vector::iterator result_idx, + size_t work_groups_no, + size_t work_group_size, + Compare compare, + const bool find_minimum, + command_queue &queue) { - typedef typename std::iterator_traits::value_type input_type; - - const context &context = queue.get_context(); - - // device vectors for the result - vector result(1, context); - vector result_idx(1, context); - - // get extremum from among the candidates - find_extrema_with_reduce( - candidates, count, result.begin(), result_idx.begin(), - 1, work_group_size, compare, find_minimum, queue + // dummy will not be used + buffer_iterator dummy = result_idx; + return find_extrema_with_reduce( + input, dummy, count, result, result_idx, work_groups_no, + work_group_size, compare, find_minimum, false, queue ); - - // get candidate index - const uint_ idx = (result_idx.begin()).read(queue); - // get extremum index - typename vector::iterator extremum_idx = candidates_idx + idx; - - // return extremum index - return extremum_idx.read(queue); -} - -template -uint_ find_extrema_final(InputIterator candidates, - vector::iterator candidates_idx, - const size_t count, - ::boost::compute::less< - typename std::iterator_traits::value_type - > compare, - const bool find_minimum, - const size_t work_group_size, - command_queue &queue) -{ - (void) work_group_size; - - typedef typename std::iterator_traits::difference_type difference_type; - typedef typename std::iterator_traits::value_type input_type; - - // host vectors - std::vector host_candidates(count); - std::vector host_candidates_idx(count); - - InputIterator candidates_last = - candidates + static_cast(count); - vector::iterator candidates_idx_last = - candidates_idx + count; - - // copying extremum candidates found by find_extrema_reduce(...) to host - ::boost::compute::copy(candidates_idx, candidates_idx_last, - host_candidates_idx.begin(), queue); - ::boost::compute::copy(candidates, candidates_last, - host_candidates.begin(), queue); - - typename std::vector::iterator i = host_candidates.begin(); - std::vector::iterator idx = host_candidates_idx.begin(); - std::vector::iterator extremum_idx = idx; - input_type extremum = *i; - - // find extremum from among the candidates - if(!find_minimum) { - while(idx != host_candidates_idx.end()) { - bool compare_result = *i > extremum; - extremum = compare_result ? *i : extremum; - extremum_idx = compare_result ? idx : extremum_idx; - idx++, i++; - } - } - else { - while(idx != host_candidates_idx.end()) { - bool compare_result = *i < extremum; - extremum = compare_result ? *i : extremum; - extremum_idx = compare_result ? idx : extremum_idx; - idx++, i++; - } - } - - // return extremum index - return (*extremum_idx); } template @@ -293,6 +263,89 @@ InputIterator find_extrema_with_reduce(InputIterator first, const size_t count = detail::iterator_range_size(first, last); + std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + + type_name(); + + // load parameters + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + // get preferred work group size and preferred number + // of work groups per compute unit + size_t work_group_size = parameters->get(cache_key, "wgsize", 256); + size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); + + // calculate work group size and number of work groups + work_group_size = (std::min)(max_work_group_size, work_group_size); + size_t work_groups_no = compute_units_no * work_groups_per_cu; + work_groups_no = (std::min)( + work_groups_no, + static_cast(std::ceil(float(count) / work_group_size)) + ); + + // phase I: finding candidates for extremum + + // device buffors for extremum candidates and their indices + // each work-group computes its candidate + vector candidates(work_groups_no, context); + vector candidates_idx(work_groups_no, context); + + // finding candidates for first extremum and their indices + find_extrema_with_reduce( + first, count, candidates.begin(), candidates_idx.begin(), + work_groups_no, work_group_size, compare, find_minimum, queue + ); + + // phase II: finding extremum from among the candidates + + // zero-copy buffers for final result (value and index) + vector > + result(1, context); + vector > + result_idx(1, context); + + // get extremum from among the candidates + find_extrema_with_reduce( + candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), + result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue + ); + + // mapping extremum index to host + uint_* result_idx_host_ptr = + static_cast( + queue.enqueue_map_buffer( + result_idx.get_buffer(), command_queue::map_read, + 0, sizeof(uint_) + ) + ); + + return first + static_cast(*result_idx_host_ptr); +} + +template +InputIterator find_extrema_with_reduce(InputIterator first, + InputIterator last, + ::boost::compute::less< + typename std::iterator_traits< + InputIterator + >::value_type + > + compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename std::iterator_traits::difference_type difference_type; + typedef typename std::iterator_traits::value_type input_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // Getting information about used queue and device + const size_t compute_units_no = device.get_info(); + const size_t max_work_group_size = device.get_info(); + + const size_t count = detail::iterator_range_size(first, last); + std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + type_name(); @@ -309,26 +362,76 @@ InputIterator find_extrema_with_reduce(InputIterator first, work_group_size = (std::min)(max_work_group_size, work_group_size); size_t work_groups_no = compute_units_no * work_groups_per_cu; work_groups_no = (std::min)( - work_groups_no, - static_cast(std::ceil(float(count) / work_group_size))); + work_groups_no, + static_cast(std::ceil(float(count) / work_group_size)) + ); - // device vectors for extremum candidates and their indices - vector candidates(work_groups_no, context); - vector candidates_idx(work_groups_no, context); + // phase I: finding candidates for extremum - // find extremum candidates and their indices + // device buffors for extremum candidates and their indices + // each work-group computes its candidate + // zero-copy buffers are used to eliminate copying data back to host + vector > + candidates(work_groups_no, context); + vector > + candidates_idx(work_groups_no, context); + + // finding candidates for first extremum and their indices find_extrema_with_reduce( first, count, candidates.begin(), candidates_idx.begin(), work_groups_no, work_group_size, compare, find_minimum, queue - ); - - // get extremum index - const uint_ extremum_idx = find_extrema_final( - candidates.begin(), candidates_idx.begin(), work_groups_no, compare, - find_minimum, work_group_size, queue ); - return first + static_cast(extremum_idx); + // phase II: finding extremum from among the candidates + + // mapping candidates and their indices to host + input_type* candidates_host_ptr = + static_cast( + queue.enqueue_map_buffer( + candidates.get_buffer(), command_queue::map_read, + 0, work_groups_no * sizeof(input_type) + ) + ); + + uint_* candidates_idx_host_ptr = + static_cast( + queue.enqueue_map_buffer( + candidates_idx.get_buffer(), command_queue::map_read, + 0, work_groups_no * sizeof(uint_) + ) + ); + + input_type* i = candidates_host_ptr; + uint_* idx = candidates_idx_host_ptr; + uint_* extremum_idx = idx; + input_type extremum = *candidates_host_ptr; + i++; idx++; + + // find extremum (serial) from among the candidates on host + if(!find_minimum) { + while(idx != (candidates_idx_host_ptr + work_groups_no)) { + input_type next = *i; + bool compare_result = next > extremum; + bool equal = next == extremum; + extremum = compare_result ? next : extremum; + extremum_idx = compare_result ? idx : extremum_idx; + extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; + idx++, i++; + } + } + else { + while(idx != (candidates_idx_host_ptr + work_groups_no)) { + input_type next = *i; + bool compare_result = next < extremum; + bool equal = next == extremum; + extremum = compare_result ? next : extremum; + extremum_idx = compare_result ? idx : extremum_idx; + extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; + idx++, i++; + } + } + + return first + static_cast(*extremum_idx); } } // end detail namespace From 8099e07f40f2f8a6f678f3e2b2b209461b976c15 Mon Sep 17 00:00:00 2001 From: Jakub Szuppe Date: Sat, 3 Oct 2015 16:07:18 +0200 Subject: [PATCH 3/3] Fix find_extrema_with_atomics Now find_extrema_with_atomics always return the first extremum in the given range. --- .../detail/find_extrema_with_atomics.hpp | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/include/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp b/include/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp index 5ebaf835..406d1bec 100644 --- a/include/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp +++ b/include/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp @@ -29,6 +29,7 @@ inline InputIterator find_extrema_with_atomics(InputIterator first, const bool find_minimum, command_queue &queue) { + typedef typename std::iterator_traits::value_type value_type; typedef typename std::iterator_traits::difference_type difference_type; const context &context = queue.get_context(); @@ -40,12 +41,32 @@ inline InputIterator find_extrema_with_atomics(InputIterator first, "const uint gid = get_global_id(0);\n" << "uint old_index = *index;\n" << - "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << - "while(" << compare(first[k.var("gid")], - first[k.var("old_index")]) << "){\n" << + k.decl("old") << + " = " << first[k.var("old_index")] << ";\n" << + k.decl("new") << + " = " << first[k.var("gid")] << ";\n" << + + k.decl("compare_result") << ";\n" << + "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << + "while(" << + "(compare_result = " << compare(k.var("old"), + k.var("new")) << ")" << + " || (!(compare_result" << + " || " << compare(k.var("new"), + k.var("old")) << ") " + "&& gid < old_index)){\n" << "#else\n" << - "while(" << compare(first[k.var("old_index")], - first[k.var("gid")]) << "){\n" << + // while condition explained for minimum case with less (<) + // as comparison function: + // while(new_value < old_value + // OR (new_value == old_value AND new_index < old_index)) + "while(" << + "(compare_result = " << compare(k.var("new"), + k.var("old")) << ")" << + " || (!(compare_result" << + " || " << compare(k.var("old"), + k.var("new")) << ") " + "&& gid < old_index)){\n" << "#endif\n" << " if(" << atomic_cmpxchg_uint(k.var("index"), @@ -54,6 +75,7 @@ inline InputIterator find_extrema_with_atomics(InputIterator first, " break;\n" << " else\n" << " old_index = *index;\n" << + "old = " << first[k.var("old_index")] << ";\n" << "}\n"; size_t index_arg_index = k.add_arg(memory_object::global_memory, "index");