diff --git a/.travis.yml b/.travis.yml index 7d83131d..21074f95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,14 +3,16 @@ compiler: - gcc - clang before_install: + - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update -qq - - sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers libboost-chrono1.48-dev libboost-date-time1.48-dev libboost-test1.48-dev libboost-system1.48-dev libboost-filesystem1.48-dev libboost-timer1.48-dev libboost-program-options1.48-dev libboost-thread1.48-dev python-yaml lcov libopencv-dev + - sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers libboost-chrono1.48-dev libboost-date-time1.48-dev libboost-test1.48-dev libboost-system1.48-dev libboost-filesystem1.48-dev libboost-timer1.48-dev libboost-program-options1.48-dev libboost-thread1.48-dev python-yaml lcov libopencv-dev g++-4.8 - gem install coveralls-lcov + - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi script: - mkdir -p build - cd build - cmake -DBOOST_COMPUTE_BUILD_TESTS=ON -DBOOST_COMPUTE_BUILD_EXAMPLES=ON -DBOOST_COMPUTE_BUILD_BENCHMARKS=ON -DBOOST_COMPUTE_USE_OFFLINE_CACHE=ON -DBOOST_COMPUTE_ENABLE_COVERAGE=ON -DBOOST_COMPUTE_HAVE_OPENCV=ON -DBOOST_COMPUTE_THREAD_SAFE=ON -DCMAKE_CXX_FLAGS="-Wall -pedantic -Werror -Wno-variadic-macros -Wno-long-long -Wno-shadow" .. 
- - make -j8 + - make -j4 - ./example/list_devices - ctest --output-on-failure - ctest --output-on-failure diff --git a/include/boost/compute/algorithm/copy_if.hpp b/include/boost/compute/algorithm/copy_if.hpp index cd6c9619..2d52f29c 100644 --- a/include/boost/compute/algorithm/copy_if.hpp +++ b/include/boost/compute/algorithm/copy_if.hpp @@ -11,87 +11,13 @@ #ifndef BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP #define BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include namespace boost { namespace compute { namespace detail { -template -inline OutputIterator copy_if_impl(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate predicate, - bool copyIndex, - command_queue &queue) -{ - typedef typename std::iterator_traits::difference_type difference_type; - - size_t count = detail::iterator_range_size(first, last); - if(count == 0){ - return result; - } - - const context &context = queue.get_context(); - - // storage for destination indices - ::boost::compute::vector indices(count, context); - - // write counts - ::boost::compute::detail::meta_kernel k1("copy_if_write_counts"); - k1 << indices.begin()[k1.get_global_id(0)] << " = " - << predicate(first[k1.get_global_id(0)]) << " ? 
1 : 0;\n"; - k1.exec_1d(queue, 0, count); - - // count number of elements to be copied - size_t copied_element_count = - ::boost::compute::count(indices.begin(), indices.end(), 1, queue); - - // scan indices - ::boost::compute::exclusive_scan(indices.begin(), - indices.end(), - indices.begin(), - queue); - - // copy values - ::boost::compute::detail::meta_kernel k2("copy_if_do_copy"); - k2 << "if(" << predicate(first[k2.get_global_id(0)]) << ")" << - " " << result[indices.begin()[k2.get_global_id(0)]] << "="; - - if(copyIndex){ - k2 << k2.get_global_id(0) << ";\n"; - } - else { - k2 << first[k2.get_global_id(0)] << ";\n"; - } - - k2.exec_1d(queue, 0, count); - - return result + static_cast(copied_element_count); -} - -template -inline discard_iterator copy_if_impl(InputIterator first, - InputIterator last, - discard_iterator result, - Predicate predicate, - bool copyIndex, - command_queue &queue) -{ - (void) copyIndex; - - return result + count_if(first, last, predicate, queue); -} - // like the copy_if() algorithm but writes the indices of the values for which // predicate returns true. 
template @@ -101,7 +27,11 @@ inline OutputIterator copy_index_if(InputIterator first, Predicate predicate, command_queue &queue = system::default_queue()) { - return detail::copy_if_impl(first, last, result, predicate, true, queue); + typedef typename std::iterator_traits::value_type T; + + return detail::transform_if_impl( + first, last, result, identity(), predicate, true, queue + ); } } // end detail namespace @@ -115,7 +45,11 @@ inline OutputIterator copy_if(InputIterator first, Predicate predicate, command_queue &queue = system::default_queue()) { - return detail::copy_if_impl(first, last, result, predicate, false, queue); + typedef typename std::iterator_traits::value_type T; + + return ::boost::compute::transform_if( + first, last, result, identity(), predicate, queue + ); } } // end compute namespace diff --git a/include/boost/compute/algorithm/detail/binary_find.hpp b/include/boost/compute/algorithm/detail/binary_find.hpp index 59b0532d..44877609 100644 --- a/include/boost/compute/algorithm/detail/binary_find.hpp +++ b/include/boost/compute/algorithm/detail/binary_find.hpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace boost { namespace compute { @@ -28,11 +29,9 @@ namespace detail{ class binary_find_kernel : public meta_kernel { public: - size_t threads; - - binary_find_kernel() : meta_kernel("binary_find") + binary_find_kernel(size_t threads) : meta_kernel("binary_find") { - threads = 128; + m_threads = threads; } template @@ -41,7 +40,7 @@ public: UnaryPredicate predicate) { typedef typename std::iterator_traits::value_type value_type; - int block = (iterator_range_size(first, last)-1)/(threads-1); + int block = (iterator_range_size(first, last)-1)/(m_threads-1); m_index_arg = add_arg(memory_object::global_memory, "index"); @@ -60,10 +59,11 @@ public: { set_arg(m_index_arg, index.get_buffer()); - return exec_1d(queue, 0, threads); + return exec_1d(queue, 0, m_threads); } private: + size_t m_threads; size_t m_index_arg; }; @@ -84,8 
+84,15 @@ inline InputIterator binary_find(InputIterator first, UnaryPredicate predicate, command_queue &queue = system::default_queue()) { + const device &device = queue.get_device(); + + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + const std::string cache_key = "__boost_binary_find"; + size_t find_if_limit = 128; - size_t threads = 128; + size_t threads = parameters->get(cache_key, "tpb", 128); size_t count = iterator_range_size(first, last); while(count > find_if_limit) { @@ -93,7 +100,7 @@ inline InputIterator binary_find(InputIterator first, scalar index(queue.get_context()); index.write(static_cast(count), queue); - binary_find_kernel kernel; + binary_find_kernel kernel(threads); kernel.set_range(first, last, predicate); kernel.exec(queue, index); diff --git a/include/boost/compute/algorithm/detail/copy_on_device.hpp b/include/boost/compute/algorithm/detail/copy_on_device.hpp index c746a2e8..c20a70c3 100644 --- a/include/boost/compute/algorithm/detail/copy_on_device.hpp +++ b/include/boost/compute/algorithm/detail/copy_on_device.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include namespace boost { @@ -42,12 +43,21 @@ template class copy_kernel : public meta_kernel { public: - copy_kernel() + copy_kernel(const device &device) : meta_kernel("copy") { m_count = 0; - m_vpt = 4; - m_tpb = 128; + + typedef typename std::iterator_traits::value_type input_type; + + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); + + std::string cache_key = + "__boost_copy_kernel_" + boost::lexical_cast(sizeof(input_type)); + + m_vpt = parameters->get(cache_key, "vpt", 4); + m_tpb = parameters->get(cache_key, "tpb", 128); } void set_range(InputIterator first, @@ -97,7 +107,9 @@ inline OutputIterator copy_on_device(InputIterator first, OutputIterator result, command_queue &queue) { - copy_kernel kernel; + const device &device = queue.get_device(); + + copy_kernel kernel(device); 
kernel.set_range(first, last, result); kernel.exec(queue); @@ -122,7 +134,9 @@ inline future copy_on_device_async(InputIterator first, OutputIterator result, command_queue &queue) { - copy_kernel kernel; + const device &device = queue.get_device(); + + copy_kernel kernel(device); kernel.set_range(first, last, result); event event_ = kernel.exec(queue); diff --git a/include/boost/compute/algorithm/detail/inplace_reduce.hpp b/include/boost/compute/algorithm/detail/inplace_reduce.hpp index 048332aa..3c300997 100644 --- a/include/boost/compute/algorithm/detail/inplace_reduce.hpp +++ b/include/boost/compute/algorithm/detail/inplace_reduce.hpp @@ -13,6 +13,8 @@ #include +#include + #include #include #include diff --git a/include/boost/compute/algorithm/detail/radix_sort.hpp b/include/boost/compute/algorithm/detail/radix_sort.hpp index 98be26fc..58140b3d 100644 --- a/include/boost/compute/algorithm/detail/radix_sort.hpp +++ b/include/boost/compute/algorithm/detail/radix_sort.hpp @@ -23,9 +23,10 @@ #include #include #include +#include +#include #include #include -#include #include namespace boost { @@ -232,19 +233,9 @@ inline void radix_sort_impl(const buffer_iterator first, typedef T value_type; typedef typename radix_sort_value_type::type sort_type; + const device &device = queue.get_device(); const context &context = queue.get_context(); - size_t count = detail::iterator_range_size(first, last); - - // sort parameters - const uint_ k = 4; - const uint_ k2 = 1 << k; - const uint_ block_size = 128; - - uint_ block_count = static_cast(count / block_size); - if(block_count * block_size != count){ - block_count++; - } // if we have a valid values iterator then we are doing a // sort by key and have to set up the values buffer @@ -258,6 +249,17 @@ inline void radix_sort_impl(const buffer_iterator first, cache_key += std::string("_with_") + type_name(); } + boost::shared_ptr cache = + program_cache::get_global_cache(context); + boost::shared_ptr parameters = + 
detail::parameter_cache::get_global_cache(device); + + // sort parameters + const uint_ k = parameters->get(cache_key, "k", 4); + const uint_ k2 = 1 << k; + const uint_ block_size = parameters->get(cache_key, "tpb", 128); + + // sort program compiler options std::stringstream options; options << "-DK_BITS=" << k; options << " -DT=" << type_name(); @@ -277,17 +279,22 @@ inline void radix_sort_impl(const buffer_iterator first, options << enable_double(); } - // load (or create) radix sort program - boost::shared_ptr cache = - program_cache::get_global_cache(context); - - program radix_sort_program = - cache->get_or_build(cache_key, options.str(), radix_sort_source, context); + // load radix sort program + program radix_sort_program = cache->get_or_build( + cache_key, options.str(), radix_sort_source, context + ); kernel count_kernel(radix_sort_program, "count"); kernel scan_kernel(radix_sort_program, "scan"); kernel scatter_kernel(radix_sort_program, "scatter"); + size_t count = detail::iterator_range_size(first, last); + + uint_ block_count = static_cast(count / block_size); + if(block_count * block_size != count){ + block_count++; + } + // setup temporary buffers vector output(count, context); vector values_output(sort_by_key ? 
count : 0, context); diff --git a/include/boost/compute/algorithm/detail/reduce_on_gpu.hpp b/include/boost/compute/algorithm/detail/reduce_on_gpu.hpp index 4f5be7f3..493f23ea 100644 --- a/include/boost/compute/algorithm/detail/reduce_on_gpu.hpp +++ b/include/boost/compute/algorithm/detail/reduce_on_gpu.hpp @@ -13,15 +13,15 @@ #include +#include #include #include #include +#include #include #include -#include #include #include -#include namespace boost { namespace compute { @@ -99,7 +99,7 @@ inline void initial_reduce(InputIterator first, (void) reduce_kernel; typedef typename std::iterator_traits::value_type Arg; - typedef typename ::boost::compute::result_of::type T; + typedef typename boost::tr1_result_of::type T; size_t count = std::distance(first, last); detail::meta_kernel k("initial_reduce"); @@ -174,6 +174,7 @@ inline void reduce_on_gpu(InputIterator first, command_queue &queue) { const device &device = queue.get_device(); + const context &context = queue.get_context(); detail::meta_kernel k("reduce"); k.add_arg(memory_object::global_memory, "input"); @@ -210,28 +211,34 @@ inline void reduce_on_gpu(InputIterator first, " output[output_offset + get_group_id(0)] = scratch[0];\n" << "}\n"; - uint_ vpt = 8; - uint_ tpb = 128; + std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name(); - size_t count = std::distance(first, last); + // load parameters + boost::shared_ptr parameters = + detail::parameter_cache::get_global_cache(device); - const context &context = queue.get_context(); + uint_ vpt = parameters->get(cache_key, "vpt", 8); + uint_ tpb = parameters->get(cache_key, "tpb", 128); - // load (or create) reduce program + // reduce program compiler flags + std::stringstream options; + options << "-DT=" << type_name() + << " -DVPT=" << vpt + << " -DTPB=" << tpb; + + // load program boost::shared_ptr cache = program_cache::get_global_cache(context); - std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name(); - - 
std::stringstream options; - options << "-DT=" << type_name() << " -DVPT=" << vpt << " -DTPB=" << tpb; - - program reduce_program = - cache->get_or_build(cache_key, options.str(), k.source(), context); + program reduce_program = cache->get_or_build( + cache_key, options.str(), k.source(), context + ); // create reduce kernel kernel reduce_kernel(reduce_program, "reduce"); + size_t count = std::distance(first, last); + // first pass, reduce from input to ping buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); diff --git a/include/boost/compute/algorithm/reverse_copy.hpp b/include/boost/compute/algorithm/reverse_copy.hpp index de47d6b9..fe92e386 100644 --- a/include/boost/compute/algorithm/reverse_copy.hpp +++ b/include/boost/compute/algorithm/reverse_copy.hpp @@ -20,6 +20,33 @@ namespace boost { namespace compute { +namespace detail { + +template +struct reverse_copy_kernel : public meta_kernel +{ + reverse_copy_kernel(Iterator first, Iterator last, OutputIterator result) + : meta_kernel("reverse_copy") + { + // store size of the range + m_size = detail::iterator_range_size(first, last); + add_set_arg("size", static_cast(m_size)); + + *this << + decl("i") << " = get_global_id(0);\n" << + decl("j") << " = size - get_global_id(0) - 1;\n" << + result[var("j")] << "=" << first[var("i")] << ";\n"; + } + + void exec(command_queue &queue) + { + exec_1d(queue, 0, m_size); + } + + size_t m_size; +}; + +} // end detail namespace /// Copies the elements in the range [\p first, \p last) in reversed /// order to the range beginning at \p result. 
@@ -36,11 +63,11 @@ reverse_copy(InputIterator first, difference_type count = std::distance(first, last); - // copy data to result - ::boost::compute::copy(first, last, result, queue); + detail::reverse_copy_kernel + kernel(first, last, result); - // reverse result - ::boost::compute::reverse(result, result + count, queue); + // run kernel + kernel.exec(queue); // return iterator to the end of result return result + count; diff --git a/include/boost/compute/algorithm/transform_if.hpp b/include/boost/compute/algorithm/transform_if.hpp new file mode 100644 index 00000000..bf727a0a --- /dev/null +++ b/include/boost/compute/algorithm/transform_if.hpp @@ -0,0 +1,117 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2015 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace boost { +namespace compute { +namespace detail { + +template +inline OutputIterator transform_if_impl(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction function, + Predicate predicate, + bool copyIndex, + command_queue &queue) +{ + typedef typename std::iterator_traits::difference_type difference_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return result; + } + + const context &context = queue.get_context(); + + // storage for destination indices + ::boost::compute::vector indices(count, context); + + // write counts + ::boost::compute::detail::meta_kernel k1("transform_if_write_counts"); + k1 << indices.begin()[k1.get_global_id(0)] << " = " + << predicate(first[k1.get_global_id(0)]) << " ? 
1 : 0;\n"; + k1.exec_1d(queue, 0, count); + + // count number of elements to be copied + size_t copied_element_count = + ::boost::compute::count(indices.begin(), indices.end(), 1, queue); + + // scan indices + ::boost::compute::exclusive_scan( + indices.begin(), indices.end(), indices.begin(), queue + ); + + // copy values + ::boost::compute::detail::meta_kernel k2("transform_if_do_copy"); + k2 << "if(" << predicate(first[k2.get_global_id(0)]) << ")" << + " " << result[indices.begin()[k2.get_global_id(0)]] << "="; + + if(copyIndex){ + k2 << k2.get_global_id(0) << ";\n"; + } + else { + k2 << function(first[k2.get_global_id(0)]) << ";\n"; + } + + k2.exec_1d(queue, 0, count); + + return result + static_cast(copied_element_count); +} + +template +inline discard_iterator transform_if_impl(InputIterator first, + InputIterator last, + discard_iterator result, + UnaryFunction function, + Predicate predicate, + bool copyIndex, + command_queue &queue) +{ + (void) function; + (void) copyIndex; + + return result + count_if(first, last, predicate, queue); +} + +} // end detail namespace + +/// Applies \p function to each element in the range [\p first, \p last) for which +/// \p predicate returns \c true and stores the results in the range beginning at \p result. 
+template +inline OutputIterator transform_if(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction function, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + return detail::transform_if_impl( + first, last, result, function, predicate, false, queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP diff --git a/include/boost/compute/detail/get_object_info.hpp b/include/boost/compute/detail/get_object_info.hpp index 1c54ed49..c5830a29 100644 --- a/include/boost/compute/detail/get_object_info.hpp +++ b/include/boost/compute/detail/get_object_info.hpp @@ -14,6 +14,9 @@ #include #include +#include +#include + #include #include diff --git a/include/boost/compute/detail/parameter_cache.hpp b/include/boost/compute/detail/parameter_cache.hpp new file mode 100644 index 00000000..64e4d281 --- /dev/null +++ b/include/boost/compute/detail/parameter_cache.hpp @@ -0,0 +1,215 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2015 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_DETAIL_PARAMETER_CACHE_HPP +#define BOOST_COMPUTE_DETAIL_PARAMETER_CACHE_HPP + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE +#include +#include +#include +#include +#endif // BOOST_COMPUTE_USE_OFFLINE_CACHE + +namespace boost { +namespace compute { +namespace detail { + +class parameter_cache : boost::noncopyable +{ +public: + parameter_cache(const device &device) + : m_dirty(false), + m_device_name(device.name()) + { + #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE + // get offline cache file name (e.g. /home/user/.boost_compute/tune/device.json) + m_file_name = make_file_name(); + + // load parameters from offline cache file (if it exists) + if(boost::filesystem::exists(m_file_name)){ + read_from_disk(); + } + #endif // BOOST_COMPUTE_USE_OFFLINE_CACHE + } + + ~parameter_cache() + { + #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE + write_to_disk(); + #endif // BOOST_COMPUTE_USE_OFFLINE_CACHE + } + + void set(const std::string &object, const std::string &parameter, uint_ value) + { + m_cache[std::make_pair(object, parameter)] = value; + + // set the dirty flag to true. this will cause the updated parameters + // to be stored to disk. 
+ m_dirty = true; + } + + uint_ get(const std::string &object, const std::string &parameter, uint_ default_value) + { + std::map, uint_>::iterator + iter = m_cache.find(std::make_pair(object, parameter)); + if(iter != m_cache.end()){ + return iter->second; + } + else { + return default_value; + } + } + + static boost::shared_ptr get_global_cache(const device &device) + { + // device name -> parameter cache + typedef std::map > cache_map; + + BOOST_COMPUTE_DETAIL_GLOBAL_STATIC(cache_map, caches, ((std::less()))); + + cache_map::iterator iter = caches.find(device.name()); + if(iter == caches.end()){ + boost::shared_ptr cache = + boost::make_shared(device); + + caches.insert(iter, std::make_pair(device.name(), cache)); + + return cache; + } + else { + return iter->second; + } + } + +private: +#ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE + // returns a string containing a canonical device name + static std::string cannonical_device_name(std::string name) + { + boost::algorithm::trim(name); + std::replace(name.begin(), name.end(), ' ', '_'); + std::replace(name.begin(), name.end(), '(', '_'); + std::replace(name.begin(), name.end(), ')', '_'); + return name; + } + + // returns the boost.compute version string + static std::string version_string() + { + char buf[32]; + std::snprintf(buf, sizeof(buf), "%d.%d.%d", BOOST_COMPUTE_VERSION_MAJOR, + BOOST_COMPUTE_VERSION_MINOR, + BOOST_COMPUTE_VERSION_PATCH); + return buf; + } + + // returns the file path for the cached parameters + std::string make_file_name() const + { + return detail::parameter_cache_path(true) + cannonical_device_name(m_device_name) + ".json"; + } + + // store current parameters to disk + void write_to_disk() + { + BOOST_ASSERT(!m_file_name.empty()); + + if(m_dirty){ + // save current parameters to disk + boost::property_tree::ptree pt; + pt.put("header.device", m_device_name); + pt.put("header.version", version_string()); + typedef std::map, uint_> map_type; + for(map_type::const_iterator iter = m_cache.begin(); 
iter != m_cache.end(); ++iter){ + const std::pair &key = iter->first; + pt.add(key.first + "." + key.second, iter->second); + } + write_json(m_file_name, pt); + + m_dirty = false; + } + } + + // load stored parameters from disk + void read_from_disk() + { + BOOST_ASSERT(!m_file_name.empty()); + + m_cache.clear(); + + boost::property_tree::ptree pt; + try { + read_json(m_file_name, pt); + } + catch(boost::property_tree::json_parser::json_parser_error &e){ + // no saved cache file, ignore + return; + } + + std::string stored_device; + try { + stored_device = pt.get("header.device"); + } + catch(boost::property_tree::ptree_bad_path&){ + return; + } + + std::string stored_version; + try { + stored_version = pt.get("header.version"); + } + catch(boost::property_tree::ptree_bad_path&){ + return; + } + + if(stored_device == m_device_name && stored_version == version_string()){ + typedef boost::property_tree::ptree::const_iterator pt_iter; + for(pt_iter iter = pt.begin(); iter != pt.end(); ++iter){ + if(iter->first == "header"){ + // skip header + continue; + } + + boost::property_tree::ptree child_pt = pt.get_child(iter->first); + for(pt_iter child_iter = child_pt.begin(); child_iter != child_pt.end(); ++child_iter){ + set(iter->first, child_iter->first, boost::lexical_cast(child_iter->second.data())); + } + } + } + + m_dirty = false; + } +#endif // BOOST_COMPUTE_USE_OFFLINE_CACHE + +private: + bool m_dirty; + std::string m_device_name; + std::string m_file_name; + std::map, uint_> m_cache; +}; + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_DETAIL_PARAMETER_CACHE_HPP diff --git a/include/boost/compute/detail/path.hpp b/include/boost/compute/detail/path.hpp new file mode 100644 index 00000000..183129ab --- /dev/null +++ b/include/boost/compute/detail/path.hpp @@ -0,0 +1,73 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz +// +// 
Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_DETAIL_PATH_HPP +#define BOOST_COMPUTE_DETAIL_PATH_HPP + +#include +#include +#include + +namespace boost { +namespace compute { +namespace detail { + +// Path delimiter symbol for the current OS. +static const std::string& path_delim() +{ + static const std::string delim = + boost::filesystem::path("/").make_preferred().string(); + return delim; +} + +// Path to appdata folder. +inline const std::string& appdata_path() +{ + #ifdef WIN32 + static const std::string appdata = detail::getenv("APPDATA") + + path_delim() + "boost_compute"; + #else + static const std::string appdata = detail::getenv("HOME") + + path_delim() + ".boost_compute"; + #endif + return appdata; +} + +// Path to cached binaries. +inline std::string program_binary_path(const std::string &hash, bool create = false) +{ + std::string dir = detail::appdata_path() + path_delim() + + hash.substr(0, 2) + path_delim() + + hash.substr(2); + + if(create && !boost::filesystem::exists(dir)){ + boost::filesystem::create_directories(dir); + } + + return dir + path_delim(); +} + +// Path to parameter caches. 
+inline std::string parameter_cache_path(bool create = false) +{ + const static std::string dir = appdata_path() + path_delim() + "tune"; + + if(create && !boost::filesystem::exists(dir)){ + boost::filesystem::create_directories(dir); + } + + return dir + path_delim(); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_DETAIL_PATH_HPP diff --git a/include/boost/compute/device.hpp b/include/boost/compute/device.hpp index ab4cbacc..29197dc5 100644 --- a/include/boost/compute/device.hpp +++ b/include/boost/compute/device.hpp @@ -11,11 +11,12 @@ #ifndef BOOST_COMPUTE_DEVICE_HPP #define BOOST_COMPUTE_DEVICE_HPP +#include #include #include -#include -#include +#include +#include #include #include @@ -213,7 +214,8 @@ public: { const std::vector extensions = this->extensions(); - return boost::find(extensions, name) != extensions.end(); + return std::find( + extensions.begin(), extensions.end(), name) != extensions.end(); } /// Returns the number of address bits. diff --git a/include/boost/compute/experimental/transform_if.hpp b/include/boost/compute/experimental/transform_if.hpp deleted file mode 100644 index 07531c0c..00000000 --- a/include/boost/compute/experimental/transform_if.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2013 Kyle Lutz -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -// -// See http://kylelutz.github.com/compute for more information. 
-//---------------------------------------------------------------------------// - -#ifndef BOOST_COMPUTE_EXPERIMENTAL_TRANSFORM_IF_HPP -#define BOOST_COMPUTE_EXPERIMENTAL_TRANSFORM_IF_HPP - -#include -#include -#include -#include - -namespace boost { -namespace compute { -namespace experimental { - -template -inline OutputIterator transform_if(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryOperator op, - Predicate predicate, - command_queue &queue) -{ - typedef typename - std::iterator_traits::difference_type difference_type; - - difference_type count = std::distance(first, last); - if(count < 1){ - return result; - } - - detail::meta_kernel k("transform_if"); - - k << - k.if_(predicate(first[k.get_global_id(0)])) << "{\n" << - result[k.get_global_id(0)] << '=' << - op(first[k.get_global_id(0)]) << ";\n" - "}\n"; - - const device &device = queue.get_device(); - const size_t work_group_size = - detail::pick_copy_work_group_size(count, device); - - k.exec_1d(queue, 0, count, work_group_size); - - return result + count; -} - -} // end experimental namespace -} // end compute namespace -} // end boost namespace - -#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP diff --git a/include/boost/compute/kernel.hpp b/include/boost/compute/kernel.hpp index 18b0a6ae..a58b7710 100644 --- a/include/boost/compute/kernel.hpp +++ b/include/boost/compute/kernel.hpp @@ -197,7 +197,7 @@ public: /// /// \see_opencl_ref{clGetKernelWorkGroupInfo} template - T get_work_group_info(const device &device, cl_kernel_work_group_info info) + T get_work_group_info(const device &device, cl_kernel_work_group_info info) const { return detail::get_object_info(clGetKernelWorkGroupInfo, m_kernel, info, device.id()); } diff --git a/include/boost/compute/platform.hpp b/include/boost/compute/platform.hpp index 7dc68d8d..4b6533f7 100644 --- a/include/boost/compute/platform.hpp +++ b/include/boost/compute/platform.hpp @@ -11,11 +11,12 @@ #ifndef BOOST_COMPUTE_PLATFORM_HPP 
#define BOOST_COMPUTE_PLATFORM_HPP +#include #include #include -#include -#include +#include +#include #include #include @@ -112,7 +113,8 @@ public: { const std::vector extensions = this->extensions(); - return boost::find(extensions, name) != extensions.end(); + return std::find( + extensions.begin(), extensions.end(), name) != extensions.end(); } /// Returns a list of devices on the platform. diff --git a/include/boost/compute/program.hpp b/include/boost/compute/program.hpp index 0daf5e2a..c85378df 100644 --- a/include/boost/compute/program.hpp +++ b/include/boost/compute/program.hpp @@ -28,9 +28,9 @@ #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE #include #include -#include #include #include +#include #include #endif @@ -558,41 +558,10 @@ public: private: #ifdef BOOST_COMPUTE_USE_OFFLINE_CACHE - // Path delimiter symbol for the current OS. - static const std::string& path_delim() { - static const std::string delim = - boost::filesystem::path("/").make_preferred().string(); - return delim; - } - - // Path to appdata folder. - static inline const std::string& appdata_path() { -#ifdef WIN32 - static const std::string appdata = detail::getenv("APPDATA") - + path_delim() + "boost_compute"; -#else - static const std::string appdata = detail::getenv("HOME") - + path_delim() + ".boost_compute"; -#endif - return appdata; - } - - // Path to cached binaries. - static std::string program_binary_path(const std::string &hash, bool create = false) - { - std::string dir = appdata_path() + path_delim() - + hash.substr(0, 2) + path_delim() - + hash.substr(2); - - if (create) boost::filesystem::create_directories(dir); - - return dir + path_delim(); - } - // Saves program binaries for future reuse. 
static void save_program_binary(const std::string &hash, const program &prog) { - std::string fname = program_binary_path(hash, true) + "kernel"; + std::string fname = detail::program_binary_path(hash, true) + "kernel"; std::ofstream bfile(fname.c_str(), std::ios::binary); if (!bfile) return; @@ -608,7 +577,7 @@ private: const std::string &hash, const context &ctx ) { - std::string fname = program_binary_path(hash) + "kernel"; + std::string fname = detail::program_binary_path(hash) + "kernel"; std::ifstream bfile(fname.c_str(), std::ios::binary); if (!bfile) return boost::optional(); diff --git a/include/boost/compute/system.hpp b/include/boost/compute/system.hpp index 5bc1cec3..57996fe2 100644 --- a/include/boost/compute/system.hpp +++ b/include/boost/compute/system.hpp @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -85,7 +84,10 @@ public: /// \throws no_device_found if no device with \p name is found. static device find_device(const std::string &name) { - BOOST_FOREACH(const device &device, devices()){ + const std::vector devices = system::devices(); + for(size_t i = 0; i < devices.size(); i++){ + const device& device = devices[i]; + if(device.name() == name){ return device; } @@ -108,10 +110,13 @@ public: { std::vector devices; - BOOST_FOREACH(const platform &platform, platforms()){ - BOOST_FOREACH(const device &device, platform.devices()){ - devices.push_back(device); - } + const std::vector platforms = system::platforms(); + for(size_t i = 0; i < platforms.size(); i++){ + const std::vector platform_devices = platforms[i].devices(); + + devices.insert( + devices.end(), platform_devices.begin(), platform_devices.end() + ); } return devices; @@ -122,8 +127,9 @@ public: { size_t count = 0; - BOOST_FOREACH(const platform &platform, platforms()){ - count += platform.device_count(); + const std::vector platforms = system::platforms(); + for(size_t i = 0; i < platforms.size(); i++){ + count += platforms[i].device_count(); } return count; @@ -214,7 
+220,8 @@ private: const char *vendor = detail::getenv("BOOST_COMPUTE_DEFAULT_VENDOR"); if(name || type || platform || vendor){ - BOOST_FOREACH(const device &device, devices_){ + for(size_t i = 0; i < devices_.size(); i++){ + const device& device = devices_[i]; if (name && !matches(device.name(), name)) continue; @@ -237,14 +244,18 @@ private: } // find the first gpu device - BOOST_FOREACH(const device &device, devices_){ + for(size_t i = 0; i < devices_.size(); i++){ + const device& device = devices_[i]; + if(device.type() == device::gpu){ return device; } } // find the first cpu device - BOOST_FOREACH(const device &device, devices_){ + for(size_t i = 0; i < devices_.size(); i++){ + const device& device = devices_[i]; + if(device.type() == device::cpu){ return device; } diff --git a/include/boost/compute/type_traits/is_fundamental.hpp b/include/boost/compute/type_traits/is_fundamental.hpp index daa19773..fc8f266c 100644 --- a/include/boost/compute/type_traits/is_fundamental.hpp +++ b/include/boost/compute/type_traits/is_fundamental.hpp @@ -33,7 +33,11 @@ struct is_fundamental : public boost::false_type {}; template<> struct is_fundamental : boost::true_type {}; \ template<> struct is_fundamental : boost::true_type {}; \ template<> struct is_fundamental : boost::true_type {}; \ - template<> struct is_fundamental : boost::true_type {}; + template<> struct is_fundamental : boost::true_type {}; \ + template<> struct is_fundamental : boost::true_type {}; \ + template<> struct is_fundamental : boost::true_type {}; \ + template<> struct is_fundamental : boost::true_type {}; \ + template<> struct is_fundamental : boost::true_type {}; BOOST_COMPUTE_DETAIL_DECLARE_FUNDAMENTAL(char) BOOST_COMPUTE_DETAIL_DECLARE_FUNDAMENTAL(uchar) diff --git a/include/boost/compute/types/fundamental.hpp b/include/boost/compute/types/fundamental.hpp index 7c2fc2ee..b5fd3854 100644 --- a/include/boost/compute/types/fundamental.hpp +++ b/include/boost/compute/types/fundamental.hpp @@ -14,7 
+14,10 @@ #include #include -#include +#include +#include +#include +#include #include diff --git a/include/boost/compute/types/struct.hpp b/include/boost/compute/types/struct.hpp index 12fa4ef3..9dfff271 100644 --- a/include/boost/compute/types/struct.hpp +++ b/include/boost/compute/types/struct.hpp @@ -15,9 +15,10 @@ #include +#include #include -#include #include +#include #include #include diff --git a/include/boost/compute/types/tuple.hpp b/include/boost/compute/types/tuple.hpp index 900f9f3d..d372e9d9 100644 --- a/include/boost/compute/types/tuple.hpp +++ b/include/boost/compute/types/tuple.hpp @@ -14,6 +14,9 @@ #include #include +#include +#include +#include #include #include @@ -25,8 +28,6 @@ #include #endif -#include - namespace boost { namespace compute { namespace detail { diff --git a/perf/CMakeLists.txt b/perf/CMakeLists.txt index 99dddfed..30d7fcdb 100644 --- a/perf/CMakeLists.txt +++ b/perf/CMakeLists.txt @@ -1,6 +1,6 @@ include_directories(../include) -set(PERF_BOOST_COMPONENTS system timer chrono) +set(PERF_BOOST_COMPONENTS system timer chrono program_options) if (${BOOST_COMPUTE_USE_OFFLINE_CACHE}) set(PERF_BOOST_COMPONENTS ${PERF_BOOST_COMPONENTS} filesystem) @@ -25,15 +25,14 @@ set(BENCHMARKS erase_remove exclusive_scan fill + find find_end includes inner_product is_permutation is_sorted - linear_congruential_engine max_element merge - mersenne_twister next_permutation nth_element partial_sum @@ -41,9 +40,11 @@ set(BENCHMARKS partition_point prev_permutation reverse + reverse_copy rotate rotate_copy host_sort + random_number_engine saxpy search search_n @@ -70,6 +71,7 @@ endforeach() set(STL_BENCHMARKS stl_accumulate stl_count + stl_find stl_find_end stl_includes stl_inner_product @@ -80,6 +82,7 @@ set(STL_BENCHMARKS stl_partition stl_prev_permutation stl_reverse + stl_reverse_copy stl_rotate stl_rotate_copy stl_saxpy @@ -118,11 +121,13 @@ if(${BOOST_COMPUTE_HAVE_CUDA}) thrust_accumulate thrust_count thrust_exclusive_scan + thrust_find 
thrust_inner_product thrust_merge thrust_partial_sum thrust_partition thrust_reverse + thrust_reverse_copy thrust_rotate thrust_saxpy thrust_set_difference diff --git a/perf/perf.hpp b/perf/perf.hpp index ba97ada9..d755e2fc 100644 --- a/perf/perf.hpp +++ b/perf/perf.hpp @@ -95,4 +95,14 @@ public: std::vector times; }; +// returns the rate (in MB/s) for processing 'count' items of type 'T' +// in 'time' nanoseconds +template +double perf_rate(const size_t count, perf_timer::nanosecond_type time) +{ + const size_t byte_count = count * sizeof(T); + + return (double(byte_count) / 1024 / 1024) / (time / 1e9); +} + #endif // PERF_HPP diff --git a/perf/perf.py b/perf/perf.py index dafd4927..6071b2e2 100755 --- a/perf/perf.py +++ b/perf/perf.py @@ -118,11 +118,13 @@ def run_benchmark(name, sizes, vs=[]): "accumulate", "count", "exclusive_scan", + "find", "inner_product", "merge", "partial_sum", "partition", "reverse", + "reverse_copy", "rotate", "saxpy", "sort", @@ -136,6 +138,7 @@ def run_benchmark(name, sizes, vs=[]): "stl": [ "accumulate", "count", + "find", "find_end", "includes", "inner_product", @@ -149,6 +152,7 @@ def run_benchmark(name, sizes, vs=[]): "partition_point", "prev_permutation", "reverse", + "reverse_copy", "rotate", "rotate_copy", "saxpy", diff --git a/perf/perf_accumulate.cpp b/perf/perf_accumulate.cpp index baacedc0..0b4956f6 100644 --- a/perf/perf_accumulate.cpp +++ b/perf/perf_accumulate.cpp @@ -13,63 +13,128 @@ #include #include +#include + #include #include #include #include "perf.hpp" +namespace po = boost::program_options; +namespace compute = boost::compute; + int rand_int() { return static_cast((rand() / double(RAND_MAX)) * 25.0); } -int main(int argc, char *argv[]) +template +double perf_accumulate(const compute::vector& data, + const size_t trials, + compute::command_queue& queue) { - perf_parse_args(argc, argv); - std::cout << "size: " << PERF_N << std::endl; - - // setup context and queue for the default device - boost::compute::device 
device = boost::compute::system::default_device(); - boost::compute::context context(device); - boost::compute::command_queue queue(context, device); - std::cout << "device: " << device.name() << std::endl; - - // create vector of random numbers on the host - std::vector host_vector(PERF_N); - std::generate(host_vector.begin(), host_vector.end(), rand_int); - - // create vector on the device and copy the data - boost::compute::vector device_vector(PERF_N, context); - boost::compute::copy( - host_vector.begin(), host_vector.end(), device_vector.begin(), queue - ); - - int sum = 0; perf_timer t; - for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + for(size_t trial = 0; trial < trials; trial++){ t.start(); - // sum vector - sum = boost::compute::accumulate( - device_vector.begin(), device_vector.end(), int(0), queue - ); + compute::accumulate(data.begin(), data.end(), T(0), queue); queue.finish(); t.stop(); } - std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + return t.min_time(); +} - // verify sum is correct - int host_sum = std::accumulate(host_vector.begin(), - host_vector.end(), - int(0)); - if(sum != host_sum){ - std::cout << "ERROR: " - << "device_sum (" << sum << ") " - << "!= " - << "host_sum (" << host_sum << ")" - << std::endl; - return -1; +template +void tune_accumulate(const compute::vector& data, + const size_t trials, + compute::command_queue& queue) +{ + boost::shared_ptr + params = compute::detail::parameter_cache::get_global_cache(queue.get_device()); + + const std::string cache_key = + std::string("__boost_reduce_on_gpu_") + compute::type_name(); + + const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 }; + const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + double min_time = std::numeric_limits::max(); + compute::uint_ best_tpb = 0; + compute::uint_ best_vpt = 0; + + for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){ + params->set(cache_key, "tpb", tpbs[i]); + 
for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){ + params->set(cache_key, "vpt", vpts[j]); + + try { + const double t = perf_accumulate(data, trials, queue); + if(t < min_time){ + best_tpb = tpbs[i]; + best_vpt = vpts[j]; + min_time = t; + } + } + catch(compute::opencl_error&){ + // invalid parameters for this device, skip + } + } } + // store optimal parameters + params->set(cache_key, "tpb", best_tpb); + params->set(cache_key, "vpt", best_vpt); +} + +int main(int argc, char *argv[]) +{ + // setup command line arguments + po::options_description options("options"); + options.add_options() + ("help", "show usage instructions") + ("size", po::value()->default_value(8192), "input size") + ("trials", po::value()->default_value(3), "number of trials to run") + ("tune", "run tuning procedure") + ; + po::positional_options_description positional_options; + positional_options.add("size", 1); + + // parse command line + po::variables_map vm; + po::store( + po::command_line_parser(argc, argv) + .options(options).positional(positional_options).run(), + vm + ); + po::notify(vm); + + const size_t size = vm["size"].as(); + const size_t trials = vm["trials"].as(); + std::cout << "size: " << size << std::endl; + + // setup context and queue for the default device + compute::device device = compute::system::default_device(); + compute::context context(device); + compute::command_queue queue(context, device); + std::cout << "device: " << device.name() << std::endl; + + // create vector of random numbers on the host + std::vector host_data(size); + std::generate(host_data.begin(), host_data.end(), rand_int); + + // create vector on the device and copy the data + compute::vector device_data( + host_data.begin(), host_data.end(), queue + ); + + // run tuning proceure (if requested) + if(vm.count("tune")){ + tune_accumulate(device_data, trials, queue); + } + + // run benchmark + double t = perf_accumulate(device_data, trials, queue); + std::cout << "time: " << t / 1e6 << " ms" 
<< std::endl; + return 0; } diff --git a/perf/perf_find.cpp b/perf/perf_find.cpp new file mode 100644 index 00000000..55f5e589 --- /dev/null +++ b/perf/perf_find.cpp @@ -0,0 +1,88 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. +int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // setup context and queue for the default device + boost::compute::device device = boost::compute::system::default_device(); + boost::compute::context context(device); + boost::compute::command_queue queue(context, device); + std::cout << "device: " << device.name() << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // create vector on the device and copy the data + boost::compute::vector device_vector(PERF_N, context); + boost::compute::copy( + host_vector.begin(), + host_vector.end(), + device_vector.begin(), + queue + ); + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + + // device iterator + boost::compute::vector::iterator device_result_it; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + device_result_it = boost::compute::find(device_vector.begin(), + 
device_vector.end(), + wanted, + queue); + queue.finish(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // verify if found index is correct by comparing it with std::find() result + size_t host_result_index = std::distance(host_vector.begin(), + std::find(host_vector.begin(), + host_vector.end(), + wanted)); + size_t device_result_index = device_result_it.get_index(); + + if(device_result_index != host_result_index){ + std::cout << "ERROR: " + << "device_result_index (" << device_result_index << ") " + << "!= " + << "host_result_index (" << host_result_index << ")" + << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_linear_congruential_engine.cpp b/perf/perf_linear_congruential_engine.cpp deleted file mode 100644 index 4edaec64..00000000 --- a/perf/perf_linear_congruential_engine.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2014 Roshan -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -// -// See http://kylelutz.github.com/compute for more information. 
-//---------------------------------------------------------------------------// - -#include -#include -#include - -#include -#include -#include - -#include "perf.hpp" - -namespace compute = boost::compute; - -int main(int argc, char *argv[]) -{ - perf_parse_args(argc, argv); - std::cout << "size: " << PERF_N << std::endl; - - // setup context and queue for the default device - compute::device device = compute::system::default_device(); - compute::context context(device); - compute::command_queue queue(context, device); - - // create vector on the device - compute::vector vector(PERF_N, context); - - // create mersenne twister engine - compute::linear_congruential_engine rng(queue); - - // generate random numbers - perf_timer t; - t.start(); - rng.generate(vector.begin(), vector.end(), queue); - queue.finish(); - t.stop(); - std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; - - return 0; -} diff --git a/perf/perf_mersenne_twister.cpp b/perf/perf_mersenne_twister.cpp deleted file mode 100644 index 3b1542d6..00000000 --- a/perf/perf_mersenne_twister.cpp +++ /dev/null @@ -1,48 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2013-2014 Kyle Lutz -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -// -// See http://kylelutz.github.com/compute for more information. 
-//---------------------------------------------------------------------------// - -#include -#include -#include - -#include -#include -#include - -#include "perf.hpp" - -namespace compute = boost::compute; - -int main(int argc, char *argv[]) -{ - perf_parse_args(argc, argv); - std::cout << "size: " << PERF_N << std::endl; - - // setup context and queue for the default device - compute::device device = compute::system::default_device(); - compute::context context(device); - compute::command_queue queue(context, device); - - // create vector on the device - compute::vector vector(PERF_N, context); - - // create mersenne twister engine - compute::mt19937 rng(queue); - - // generate random numbers - perf_timer t; - t.start(); - rng.generate(vector.begin(), vector.end(), queue); - queue.finish(); - t.stop(); - std::cout << "time: " << t.last_time() / 1e6 << " ms" << std::endl; - - return 0; -} diff --git a/perf/perf_random_number_engine.cpp b/perf/perf_random_number_engine.cpp new file mode 100644 index 00000000..1f4ac664 --- /dev/null +++ b/perf/perf_random_number_engine.cpp @@ -0,0 +1,98 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2015 Kyle Lutz +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include + +#include + +#include +#include +#include + +#include "perf.hpp" + +namespace compute = boost::compute; +namespace po = boost::program_options; + +template +void perf_random_number_engine(const size_t size, + const size_t trials, + compute::command_queue& queue) +{ + typedef typename Engine::result_type T; + + // create random number engine + Engine engine(queue); + + // create vector on the device + std::cout << "size = " << size << std::endl; + compute::vector vector(size, queue.get_context()); + + // generate random numbers + perf_timer t; + for(size_t i = 0; i < trials; i++){ + t.start(); + engine.generate(vector.begin(), vector.end(), queue); + queue.finish(); + t.stop(); + } + + // print result + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + std::cout << "rate: " << perf_rate(size, t.min_time()) << " MB/s" << std::endl; +} + +int main(int argc, char *argv[]) +{ + // setup and parse command line options + po::options_description options("options"); + options.add_options() + ("help", "show usage instructions") + ("size", po::value()->default_value(8192), "number of values") + ("trials", po::value()->default_value(3), "number of trials") + ("engine", po::value()->default_value("default_random_engine"), "random number engine") + ; + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, options), vm); + po::notify(vm); + + if(vm.count("help")) { + std::cout << options << std::endl; + return 0; + } + + // setup context and queue for the default device + compute::device device = compute::system::default_device(); + compute::context context(device); + compute::command_queue queue(context, device); + + // get command line options + const size_t size = vm["size"].as(); + const size_t trials = vm["trials"].as(); + const std::string& engine = vm["engine"].as(); + + // run benchmark + if(engine == "default_random_engine"){ + 
perf_random_number_engine(size, trials, queue); + } + else if(engine == "mersenne_twister_engine"){ + perf_random_number_engine(size, trials, queue); + } + else if(engine == "linear_congruential_engine"){ + perf_random_number_engine >(size, trials, queue); + } + else { + std::cerr << "error: unknown random number engine '" << engine << "'" << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_reverse_copy.cpp b/perf/perf_reverse_copy.cpp new file mode 100644 index 00000000..60ce7c39 --- /dev/null +++ b/perf/perf_reverse_copy.cpp @@ -0,0 +1,65 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * 25.0); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // setup context and queue for the default device + boost::compute::device device = boost::compute::system::default_device(); + boost::compute::context context(device); + boost::compute::command_queue queue(context, device); + std::cout << "device: " << device.name() << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // create vector on the device and copy the data + boost::compute::vector device_vector(PERF_N, context); + boost::compute::copy( + host_vector.begin(), host_vector.end(), device_vector.begin(), queue + ); + + // create vector on the device for reversed data + 
boost::compute::vector device_reversed_vector(PERF_N, context); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + boost::compute::reverse_copy( + device_vector.begin(), device_vector.end(), + device_reversed_vector.begin(), + queue + ); + queue.finish(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/perf/perf_saxpy.cpp b/perf/perf_saxpy.cpp index 904834c1..0ed99bc3 100644 --- a/perf/perf_saxpy.cpp +++ b/perf/perf_saxpy.cpp @@ -12,6 +12,8 @@ #include #include +#include + #include #include #include @@ -20,85 +22,140 @@ #include "perf.hpp" +namespace po = boost::program_options; +namespace compute = boost::compute; + float rand_float() { return (float(rand()) / float(RAND_MAX)) * 1000.f; } -// y <- alpha * x + y -void serial_saxpy(size_t n, float alpha, const float *x, float *y) +template +double perf_saxpy(const compute::vector& x, + const compute::vector& y, + const T alpha, + const size_t trials, + compute::command_queue& queue) { - for(size_t i = 0; i < n; i++){ - y[i] = alpha * x[i] + y[i]; + // create vector on the device to store the result + compute::vector result(x.size(), queue.get_context()); + + perf_timer t; + for(size_t trial = 0; trial < trials; trial++){ + compute::fill(result.begin(), result.end(), T(0), queue); + + t.start(); + + using compute::lambda::_1; + using compute::lambda::_2; + + compute::transform( + x.begin(), x.end(), y.begin(), result.begin(), alpha * _1 + _2, queue + ); + + queue.finish(); + t.stop(); } + + return t.min_time(); +} + +template +void tune_saxpy(const compute::vector& x, + const compute::vector& y, + const T alpha, + const size_t trials, + compute::command_queue& queue) +{ + boost::shared_ptr + params = compute::detail::parameter_cache::get_global_cache(queue.get_device()); + + const std::string cache_key = + std::string("__boost_copy_kernel_") + boost::lexical_cast(sizeof(T)); + + const compute::uint_ tpbs[] = { 4, 
8, 16, 32, 64, 128, 256, 512, 1024 }; + const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + double min_time = std::numeric_limits::max(); + compute::uint_ best_tpb = 0; + compute::uint_ best_vpt = 0; + + for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){ + params->set(cache_key, "tpb", tpbs[i]); + for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){ + params->set(cache_key, "vpt", vpts[j]); + + try { + const double t = perf_saxpy(x, y, alpha, trials, queue); + if(t < min_time){ + best_tpb = tpbs[i]; + best_vpt = vpts[j]; + min_time = t; + } + } + catch(compute::opencl_error&){ + // invalid parameters for this device, skip + } + } + } + + // store optimal parameters + params->set(cache_key, "tpb", best_tpb); + params->set(cache_key, "vpt", best_vpt); } int main(int argc, char *argv[]) { - perf_parse_args(argc, argv); + // setup command line arguments + po::options_description options("options"); + options.add_options() + ("help", "show usage instructions") + ("size", po::value()->default_value(8192), "input size") + ("trials", po::value()->default_value(3), "number of trials to run") + ("tune", "run tuning procedure") + ("alpha", po::value()->default_value(2.5), "saxpy alpha value") + ; + po::positional_options_description positional_options; + positional_options.add("size", 1); - using boost::compute::lambda::_1; - using boost::compute::lambda::_2; + // parse command line + po::variables_map vm; + po::store( + po::command_line_parser(argc, argv) + .options(options).positional(positional_options).run(), + vm + ); + po::notify(vm); - std::cout << "size: " << PERF_N << std::endl; - - float alpha = 2.5f; + const size_t size = vm["size"].as(); + const size_t trials = vm["trials"].as(); + const float alpha = vm["alpha"].as(); + std::cout << "size: " << size << std::endl; // setup context and queue for the default device - boost::compute::device device = boost::compute::system::default_device(); - 
boost::compute::context context(device); - boost::compute::command_queue queue(context, device); + compute::device device = boost::compute::system::default_device(); + compute::context context(device); + compute::command_queue queue(context, device); std::cout << "device: " << device.name() << std::endl; // create vector of random numbers on the host - std::vector host_x(PERF_N); - std::vector host_y(PERF_N); + std::vector host_x(size); + std::vector host_y(size); std::generate(host_x.begin(), host_x.end(), rand_float); std::generate(host_y.begin(), host_y.end(), rand_float); // create vector on the device and copy the data - boost::compute::vector device_x(host_x.begin(), host_x.end(), queue); - boost::compute::vector device_y(host_y.begin(), host_y.end(), queue); + compute::vector x(host_x.begin(), host_x.end(), queue); + compute::vector y(host_y.begin(), host_y.end(), queue); - perf_timer t; - for(size_t trial = 0; trial < PERF_TRIALS; trial++){ - boost::compute::copy(host_x.begin(), host_x.end(), device_x.begin(), queue); - boost::compute::copy(host_y.begin(), host_y.end(), device_y.begin(), queue); - - t.start(); - boost::compute::transform( - device_x.begin(), - device_x.end(), - device_y.begin(), - device_y.begin(), - alpha * _1 + _2, - queue - ); - queue.finish(); - t.stop(); + // run tuning proceure (if requested) + if(vm.count("tune")){ + tune_saxpy(x, y, alpha, trials, queue); } - std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; - // perform saxpy on host - serial_saxpy(PERF_N, alpha, &host_x[0], &host_y[0]); - - // copy device_y to host_x - boost::compute::copy(device_y.begin(), device_y.end(), host_x.begin(), queue); - - for(size_t i = 0; i < PERF_N; i++){ - float host_value = host_y[i]; - float device_value = host_x[i]; - - if(std::abs(device_value - host_value) > 1e-3){ - std::cout << "ERROR: " - << "value at " << i << " " - << "device_value (" << device_value << ") " - << "!= " - << "host_value (" << host_value << ")" - << 
std::endl; - return -1; - } - } + // run benchmark + double t = perf_saxpy(x, y, alpha, trials, queue); + std::cout << "time: " << t / 1e6 << " ms" << std::endl; return 0; } diff --git a/perf/perf_sort.cpp b/perf/perf_sort.cpp index a5a677db..92b66f6b 100644 --- a/perf/perf_sort.cpp +++ b/perf/perf_sort.cpp @@ -12,6 +12,8 @@ #include #include +#include + #include #include #include @@ -19,53 +21,110 @@ #include "perf.hpp" +namespace po = boost::program_options; +namespace compute = boost::compute; + +template +double perf_sort(const std::vector& data, + const size_t trials, + compute::command_queue& queue) +{ + compute::vector vec(data.size(), queue.get_context()); + + perf_timer t; + for(size_t trial = 0; trial < trials; trial++){ + compute::copy(data.begin(), data.end(), vec.begin(), queue); + t.start(); + compute::sort(vec.begin(), vec.end(), queue); + queue.finish(); + t.stop(); + + if(!compute::is_sorted(vec.begin(), vec.end(), queue)){ + std::cerr << "ERROR: is_sorted() returned false" << std::endl; + } + } + return t.min_time(); +} + +template +void tune_sort(const std::vector& data, + const size_t trials, + compute::command_queue& queue) +{ + boost::shared_ptr + params = compute::detail::parameter_cache::get_global_cache(queue.get_device()); + + const std::string cache_key = + std::string("__boost_radix_sort_") + compute::type_name(); + + const compute::uint_ tpbs[] = { 32, 64, 128, 256, 512, 1024 }; + + double min_time = std::numeric_limits::max(); + compute::uint_ best_tpb = 0; + + for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){ + params->set(cache_key, "tpb", tpbs[i]); + + try { + const double t = perf_sort(data, trials, queue); + if(t < min_time){ + best_tpb = tpbs[i]; + min_time = t; + } + } + catch(compute::opencl_error&){ + // invalid work group size for this device, skip + } + } + + // store optimal parameters + params->set(cache_key, "tpb", best_tpb); +} + int main(int argc, char *argv[]) { - perf_parse_args(argc, argv); + // setup command 
line arguments + po::options_description options("options"); + options.add_options() + ("help", "show usage instructions") + ("size", po::value()->default_value(8192), "input size") + ("trials", po::value()->default_value(3), "number of trials to run") + ("tune", "run tuning procedure") + ; + po::positional_options_description positional_options; + positional_options.add("size", 1); - std::cout << "size: " << PERF_N << std::endl; + // parse command line + po::variables_map vm; + po::store( + po::command_line_parser(argc, argv) + .options(options).positional(positional_options).run(), + vm + ); + po::notify(vm); + + const size_t size = vm["size"].as(); + const size_t trials = vm["trials"].as(); + std::cout << "size: " << size << std::endl; // setup context and queue for the default device - boost::compute::device device = boost::compute::system::default_device(); - boost::compute::context context(device); - boost::compute::command_queue queue(context, device); + compute::device device = boost::compute::system::default_device(); + compute::context context(device); + compute::command_queue queue(context, device); std::cout << "device: " << device.name() << std::endl; // create vector of random numbers on the host - std::vector host_vector(PERF_N); - std::generate(host_vector.begin(), host_vector.end(), rand); + std::vector data(size); + std::generate(data.begin(), data.end(), rand); - // create vector on the device and copy the data - boost::compute::vector device_vector(PERF_N, context); - - perf_timer t; - for(size_t trial = 0; trial < PERF_TRIALS; trial++){ - boost::compute::copy( - host_vector.begin(), - host_vector.end(), - device_vector.begin(), - queue - ); - - t.start(); - // sort vector - boost::compute::sort( - device_vector.begin(), - device_vector.end(), - queue - ); - queue.finish(); - t.stop(); + // run tuning proceure (if requested) + if(vm.count("tune")){ + tune_sort(data, trials, queue); } - std::cout << "time: " << t.min_time() / 1e6 << " ms" << 
std::endl; - // verify vector is sorted - if(!boost::compute::is_sorted(device_vector.begin(), - device_vector.end(), - queue)){ - std::cout << "ERROR: is_sorted() returned false" << std::endl; - return -1; - } + // run sort benchmark + double t = perf_sort(data, trials, queue); + std::cout << "time: " << t / 1e6 << " ms" << std::endl; return 0; } diff --git a/perf/perf_stl_find.cpp b/perf/perf_stl_find.cpp new file mode 100644 index 00000000..b2945d9c --- /dev/null +++ b/perf/perf_stl_find.cpp @@ -0,0 +1,58 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. 
+int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + + // result + std::vector::iterator host_result_it; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + host_result_it = std::find(host_vector.begin(), host_vector.end(), wanted); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // verify + if(host_result_it != host_vector.end()){ + std::cout << "ERROR: " + << "host_result_iterator != " + << "host_vector.end()" + << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_stl_reverse_copy.cpp b/perf/perf_stl_reverse_copy.cpp new file mode 100644 index 00000000..4503c138 --- /dev/null +++ b/perf/perf_stl_reverse_copy.cpp @@ -0,0 +1,45 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include +#include +#include + +#include "perf.hpp" + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * 25.0); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // create vector of random numbers on the host + std::vector host_vector(PERF_N); + std::generate(host_vector.begin(), host_vector.end(), rand_int); + + // create vector for reversed data + std::vector host_reversed_vector(PERF_N); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + std::reverse_copy(host_vector.begin(), host_vector.end(), + host_reversed_vector.begin()); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/perf/perf_thrust_find.cu b/perf/perf_thrust_find.cu new file mode 100644 index 00000000..a0e89c9a --- /dev/null +++ b/perf/perf_thrust_find.cu @@ -0,0 +1,65 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include +#include +#include + +#include +#include +#include + +#include "perf.hpp" + +// Max integer that can be generated by rand_int() function. 
+int rand_int_max = 25; + +int rand_int() +{ + return static_cast((rand() / double(RAND_MAX)) * rand_int_max); +} + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + std::cout << "size: " << PERF_N << std::endl; + + // create vector of random numbers on the host + thrust::host_vector host_vector(PERF_N); + thrust::generate(host_vector.begin(), host_vector.end(), rand_int); + + thrust::device_vector v = host_vector; + + // trying to find element that isn't in vector (worst-case scenario) + int wanted = rand_int_max + 1; + + // result + thrust::device_vector::iterator device_result_it; + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + device_result_it = thrust::find(v.begin(), v.end(), wanted); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + // verify + if(device_result_it != v.end()){ + std::cout << "ERROR: " + << "device_result_iterator != " + << "v.end()" + << std::endl; + return -1; + } + + return 0; +} diff --git a/perf/perf_thrust_reverse_copy.cu b/perf/perf_thrust_reverse_copy.cu new file mode 100644 index 00000000..2a34f93e --- /dev/null +++ b/perf/perf_thrust_reverse_copy.cu @@ -0,0 +1,46 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://kylelutz.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#include +#include + +#include +#include +#include +#include +#include + +#include "perf.hpp" + +int main(int argc, char *argv[]) +{ + perf_parse_args(argc, argv); + + std::cout << "size: " << PERF_N << std::endl; + thrust::host_vector h_vec = generate_random_vector(PERF_N); + + // transfer data to the device + thrust::device_vector d_vec; + d_vec = h_vec; + + // device vector for reversed data + thrust::device_vector d_reversed_vec(PERF_N); + + perf_timer t; + for(size_t trial = 0; trial < PERF_TRIALS; trial++){ + t.start(); + thrust::reverse_copy(d_vec.begin(), d_vec.end(), d_reversed_vec.begin()); + cudaDeviceSynchronize(); + t.stop(); + } + std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl; + + return 0; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac27ff20..3f717ac6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -117,6 +117,7 @@ add_compute_test("algorithm.sort_by_key" test_sort_by_key.cpp) add_compute_test("algorithm.stable_partition" test_stable_partition.cpp) add_compute_test("algorithm.stable_sort" test_stable_sort.cpp) add_compute_test("algorithm.transform" test_transform.cpp) +add_compute_test("algorithm.transform_if" test_transform_if.cpp) add_compute_test("algorithm.transform_reduce" test_transform_reduce.cpp) add_compute_test("algorithm.unique" test_unique.cpp) add_compute_test("algorithm.unique_copy" test_unique_copy.cpp) @@ -190,7 +191,6 @@ add_compute_test("experimental.clamp_range" test_clamp_range.cpp) add_compute_test("experimental.malloc" test_malloc.cpp) add_compute_test("experimental.sort_by_transform" test_sort_by_transform.cpp) add_compute_test("experimental.tabulate" test_tabulate.cpp) -add_compute_test("experimental.transform_if" test_transform_if.cpp) # miscellaneous tests add_compute_test("misc.amd_cpp_kernel_language" test_amd_cpp_kernel_language.cpp) diff --git a/test/test_reverse.cpp 
b/test/test_reverse.cpp index ea96358a..f2dcc557 100644 --- a/test/test_reverse.cpp +++ b/test/test_reverse.cpp @@ -68,6 +68,14 @@ BOOST_AUTO_TEST_CASE(reverse_copy_int) bc::reverse_copy(a.begin(), a.end(), b.begin()); BOOST_CHECK(iter == b.end()); CHECK_RANGE_EQUAL(int, 5, b, (4, 3, 2, 1, 0)); + + iter = bc::reverse_copy(b.begin() + 1, b.end(), a.begin() + 1); + BOOST_CHECK(iter == a.end()); + CHECK_RANGE_EQUAL(int, 5, a, (0, 0, 1, 2, 3)); + + iter = bc::reverse_copy(a.begin(), a.end() - 1, b.begin()); + BOOST_CHECK(iter == (b.end() - 1)); + CHECK_RANGE_EQUAL(int, 5, b, (2, 1, 0, 0, 0)); } BOOST_AUTO_TEST_CASE(reverse_copy_counting_iterator) diff --git a/test/test_transform.cpp b/test/test_transform.cpp index 7054b03a..ecd78cc1 100644 --- a/test/test_transform.cpp +++ b/test/test_transform.cpp @@ -298,4 +298,27 @@ boost::compute::transform( CHECK_RANGE_EQUAL(int, 4, vec, (1, 2, 3, 4)); } +BOOST_AUTO_TEST_CASE(abs_if_odd) +{ + // return absolute value only for odd values + BOOST_COMPUTE_FUNCTION(int, abs_if_odd, (int x), + { + if(x & 1){ + return abs(x); + } + else { + return x; + } + }); + + int data[] = { -2, -3, -4, -5, -6, -7, -8, -9 }; + compute::vector vector(data, data + 8, queue); + + compute::transform( + vector.begin(), vector.end(), vector.begin(), abs_if_odd, queue + ); + + CHECK_RANGE_EQUAL(int, 8, vector, (-2, +3, -4, +5, -6, +7, -8, +9)); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/test/test_transform_if.cpp b/test/test_transform_if.cpp index ba738a25..c6c814bb 100644 --- a/test/test_transform_if.cpp +++ b/test/test_transform_if.cpp @@ -12,8 +12,7 @@ #include #include -#include -#include +#include #include #include "check_macros.hpp" @@ -21,26 +20,20 @@ namespace compute = boost::compute; -BOOST_AUTO_TEST_CASE(abs_if_odd) +BOOST_AUTO_TEST_CASE(transform_if_odd) { - using compute::lambda::_1; + using boost::compute::abs; + using boost::compute::lambda::_1; - // input data int data[] = { -2, -3, -4, -5, -6, -7, -8, -9 }; compute::vector vector(data, 
data + 8, queue); - // calculate absolute value only for odd values - compute::experimental::transform_if( - vector.begin(), - vector.end(), - vector.begin(), - compute::abs(), - _1 % 2 != 0, - queue + compute::vector::iterator end = compute::transform_if( + vector.begin(), vector.end(), vector.begin(), abs(), _1 % 2 != 0, queue ); + BOOST_CHECK_EQUAL(std::distance(vector.begin(), end), 4); - // check transformed values - CHECK_RANGE_EQUAL(int, 8, vector, (-2, +3, -4, +5, -6, +7, -8, +9)); + CHECK_RANGE_EQUAL(int, 4, vector, (+3, +5, +7, +9)); } BOOST_AUTO_TEST_SUITE_END()