From 4fdef7756e48169a1bb01f61e4b91e8a6001f4ce Mon Sep 17 00:00:00 2001
From: Benoit Dequidt
Date: Tue, 6 May 2014 23:59:45 +0800
Subject: [PATCH 1/4] Creation of simple_moving_average example

---
 example/CMakeLists.txt            |   1 +
 example/simple_moving_average.cpp | 133 ++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 example/simple_moving_average.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index e580775c..42e22cac 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -20,6 +20,7 @@ set(EXAMPLES
   time_copy
   transform_sqrt
   vector_addition
+  simple_moving_average
 )
 
 if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})
diff --git a/example/simple_moving_average.cpp b/example/simple_moving_average.cpp
new file mode 100644
index 00000000..393c12f8
--- /dev/null
+++ b/example/simple_moving_average.cpp
@@ -0,0 +1,133 @@
+
+
+#include <iostream>
+#include <vector>
+
+#include <boost/compute.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+
+
+/// warning: results are not exact due
+/// to float error accumulation when the size is large enough;
+/// for more precision use double
+/// or a Kahan sum, else results can diverge
+/// from the CPU implementation
+compute::program _sma_program(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
+        {
+            const int gid = get_global_id(0);
+
+            float cumValues = 0.;
+            int endIdx = gid + wSize/2;
+            int startIdx = gid -1 - wSize/2;
+
+            if(endIdx > size -1)
+                endIdx = size -1;
+
+            cumValues += scannedValues[endIdx];
+            if(startIdx < 0)
+                startIdx = -1;
+            else
+                cumValues -= scannedValues[startIdx];
+
+            output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
+        }
+
+    );
+
+    // create sma program
+    compute::program program = compute::program::build_with_source(source,context);
+    return program;
+}
+
+
+bool _check(const std::vector<float>& values, const std::vector<float>& smoothValues,
+            unsigned int wSize)
+{
+    int size = values.size();
+    if(size != (int)smoothValues.size()) return false;
+
+    int semiWidth = wSize/2;
+
+    bool res = true;
+    for(int idx = 0 ; idx < size ; ++idx)
+    {
+        int start = std::max(idx - semiWidth,0);
+        int end = std::min(idx + semiWidth,size-1);
+        float expected = 0;
+        for(int j = start ; j <= end ; ++j)
+        {
+            expected += values[j];
+        }
+
+        expected /= float(end - start +1);
+
+        if(std::abs(expected-smoothValues[idx]) > 1e-3)
+        {
+            std::cout << "idx = " << idx << " -- expected = " << expected << " -- result = " << smoothValues[idx] << std::endl;
+            res = false;
+        }
+    }
+
+    return res;
+}
+
+
+// generate a uniform law over [0,10[
+float myRand()
+{
+    static const double divisor = double(RAND_MAX)+1.;
+    return double(rand())/divisor * 10.;
+}
+
+int main()
+{
+
+    unsigned int size = 1024;
+    // wSize must be odd
+    unsigned int wSize = 21;
+    // get the default device
+    compute::device device = compute::system::default_device();
+    // create a context for the device
+    compute::context context(device);
+    // get the program
+    compute::program program = _sma_program(context);
+
+    // create vector of random numbers on the host
+    std::vector<float> host_vector(size);
+    std::vector<float> host_result(size);
+    std::generate(host_vector.begin(), host_vector.end(), myRand);
+
+    compute::vector<float> a(size,context);
+    compute::vector<float> b(size,context);
+    compute::vector<float> c(size,context);
+    compute::command_queue queue(context, device);
+
+    compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
+
+    // scan values
+    compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
+    // sma kernel
+    compute::kernel kernel(program, "SMA");
+    kernel.set_arg(0,b.get_buffer());
+    kernel.set_arg(1,(int)b.size());
+    kernel.set_arg(2,c.get_buffer());
+    kernel.set_arg(3,(int)wSize);
+
+    uint tpb = 128;
+    uint workSize = size;
+    queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
+
+    compute::copy(c.begin(),c.end(),host_result.begin(),queue);
+
+    bool res = _check(host_vector,host_result,wSize);
+    std::string status = res ? "results are equivalent" : "GPU results differs from CPU one's";
+    std::cout << status << std::endl;
+
+    return 0;
+}

From 8587c1b9d96101031d4c53bb73e39a44de8fbd47 Mon Sep 17 00:00:00 2001
From: Benoit Dequidt
Date: Wed, 14 May 2014 00:31:24 +0800
Subject: [PATCH 2/4] creation of matrix_transpose example

---
 example/CMakeLists.txt       |   1 +
 example/matrix_transpose.cpp | 321 +++++++++++++++++++++++++++++++++++
 2 files changed, 322 insertions(+)
 create mode 100644 example/matrix_transpose.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 42e22cac..f0ecf268 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -21,6 +21,7 @@ set(EXAMPLES
   transform_sqrt
   vector_addition
   simple_moving_average
+  matrix_transpose
 )
 
 if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})
diff --git a/example/matrix_transpose.cpp b/example/matrix_transpose.cpp
new file mode 100644
index 00000000..43c08031
--- /dev/null
+++ b/example/matrix_transpose.cpp
@@ -0,0 +1,321 @@
+#include <iostream>
+#include <sstream>
+
+#include <boost/compute.hpp>
+#include <boost/compute/utility/source.hpp>
+
+namespace compute = boost::compute;
+
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+/// \fn _copyKernel
+/// \brief generate a copy kernel program
+compute::program _copyKernel(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void copy_kernel(__global const float *src, __global float *dst)
+        {
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
+            {
+                dst[(y+i)*width +x] = src[(y+i)*width + x];
+            }
+        }
+    );
+    // create copy program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM
+            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+    compute::program program = compute::program::build_with_source(source,context,options.str());
+    return program;
+}
+
+/// \fn _naiveTransposeKernel
+/// \brief generate a naive transpose kernel program
+compute::program _naiveTransposeKernel(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void naiveTranspose(__global const float *src, __global float *dst)
+        {
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
+            {
+                dst[x*width + y+i] = src[(y+i)*width + x];
+            }
+        }
+    );
+
+    // create naiveTranspose program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM
+            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+    compute::program program = compute::program::build_with_source(source,context,options.str());
+    return program;
+}
+
+/// \fn _coalescedTransposeKernel
+/// \brief generate a coalesced transpose kernel program
+compute::program _coalescedTransposeKernel(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void coalescedTranspose(__global const float *src, __global float *dst)
+        {
+            __local float tile[TILE_DIM][TILE_DIM];
+
+            // compute indexes
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            // load inside local memory
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
+            {
+                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // transpose indexes
+            x = get_group_id(1) * TILE_DIM + get_local_id(0);
+            y = get_group_id(0) * TILE_DIM + get_local_id(1);
+
+            // write output from local memory
+            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
+            {
+                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+            }
+
+        }
+    );
+
+    // create coalescedTranspose program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM
+            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+    compute::program program = compute::program::build_with_source(source,context,options.str());
+    return program;
+}
+
+/// \fn _coalescedNoBankConflictsKernel
+/// \brief generate a coalesced without bank conflicts kernel program
+compute::program _coalescedNoBankConflictsKernel(const compute::context& context)
+{
+    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
+        __kernel void coalescedNoBankConflicts(__global const float *src, __global float *dst)
+        {
+            // TILE_DIM+1 is here to avoid bank conflicts in local memory
+            __local float tile[TILE_DIM][TILE_DIM+1];
+
+            // compute indexes
+            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
+            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
+
+            uint width = get_num_groups(0) * TILE_DIM;
+
+            // load inside local memory
+            for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
+            {
+                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // transpose indexes
+            x = get_group_id(1) * TILE_DIM + get_local_id(0);
+            y = get_group_id(0) * TILE_DIM + get_local_id(1);
+            // write output from local memory
+            for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
+            {
+                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
+            }
+
+        }
+    );
+
+    // create coalescedNoBankConflicts program
+    std::stringstream options;
+    options << "-DTILE_DIM=" << TILE_DIM
+            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
+    compute::program program = compute::program::build_with_source(source,context,options.str());
+    return program;
+}
+
+/// \fn _checkTransposition
+/// \brief Compare @a expectedResult to @a transposedMatrix
+bool _checkTransposition(const std::vector<float>& expectedResult,
+                         uint size,
+                         const std::vector<float>& transposedMatrix)
+{
+    for(uint i = 0 ; i < size ; ++i)
+    {
+        if(expectedResult[i] != transposedMatrix[i])
+        {
+            std::cout << "idx = " << i << " , expected " << expectedResult[i] <<
+                " , got " << transposedMatrix[i] << std::endl;
+            std::cout << "FAILED" << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+/// \fn _generateMatrix
+/// \brief generate a matrix inside @a in and do the transposition inside @a transposeRef
+void _generateMatrix(std::vector<float>& in, std::vector<float>& transposeRef, uint nx, uint ny)
+{
+    // generate a matrix
+    for(uint i = 0 ; i < nx ; ++i)
+    {
+        for(uint j = 0 ; j < ny ; ++j)
+        {
+            in[i*ny + j] = i*ny + j;
+        }
+    }
+
+    // store transposed result
+    for(uint j = 0; j < ny ; ++j)
+    {
+        for(uint i = 0 ; i < nx ; ++i)
+        {
+            transposeRef[j*nx + i] = in[i*ny + j];
+        }
+    }
+}
+
+#define _BEGIN_TEST(name) std::cout << name << std::endl;
+#define _END std::cout << std::endl;
+
+
+int main()
+{
+    const uint nx = 4096;
+    const uint ny = 4096;
+
+    std::cout << "Matrix Size: " << nx << "x" << ny << std::endl;
+    std::cout << "Grid Size: " << nx/TILE_DIM << "x" << ny/TILE_DIM << " blocks" << std::endl;
+    std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
+
+    std::cout << std::endl;
+
+    const size_t g_workSize[2] = {nx,ny*BLOCK_ROWS/TILE_DIM};
+    const size_t l_workSize[2] = {TILE_DIM,BLOCK_ROWS};
+
+    const uint size = nx * ny;
+
+    std::vector<float> h_input(size);
+    std::vector<float> h_output(size);
+    std::vector<float> expectedResult(size);
+    _generateMatrix(h_input,expectedResult,nx,ny);
+
+    // get the default device
+    compute::device device = compute::system::default_device();
+
+    // create a context for the device
+    compute::context context(device);
+
+    // device vectors
+    compute::vector<float> d_input(size,context);
+    compute::vector<float> d_output(size,context);
+
+    // command_queue with profiling
+    compute::command_queue queue(context, device,compute::command_queue::enable_profiling);
+
+    // copy input data
+    compute::copy(h_input.begin(),h_input.end(),d_input.begin(),queue);
+
+    compute::program copy_program = _copyKernel(context);
+    compute::kernel kernel(copy_program,"copy_kernel");
+    kernel.set_arg(0,d_input);
+    kernel.set_arg(1,d_output);
+
+    compute::event start;
+    _BEGIN_TEST("Copy_Kernel");
+    start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
+    queue.finish();
+    uint64_t elapsed = start.duration().count();
+
+    std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
+    std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
+    compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
+
+    if(_checkTransposition(h_input,nx*ny,h_output))
+        std::cout << "\tStatus: Success" << std::endl;
+    else
+        std::cout << "\tStatus: Error" << std::endl;
+    _END
+
+    _BEGIN_TEST("naiveTranspose")
+
+    kernel = compute::kernel(_naiveTransposeKernel(context),"naiveTranspose");
+    kernel.set_arg(0,d_input);
+    kernel.set_arg(1,d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
+    queue.finish();
+    elapsed = start.duration().count();
+    std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
+    std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
+    compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
+
+    if(_checkTransposition(expectedResult,nx*ny,h_output))
+        std::cout << "\tStatus: Success" << std::endl;
+    else
+        std::cout << "\tStatus: Error" << std::endl;
+    _END
+
+    _BEGIN_TEST("coalescedTranspose")
+
+    kernel = compute::kernel(_coalescedTransposeKernel(context),"coalescedTranspose");
+    kernel.set_arg(0,d_input);
+    kernel.set_arg(1,d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
+    queue.finish();
+    elapsed = start.duration().count();
+    std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
+    std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
+
+
+    compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
+
+    if(_checkTransposition(expectedResult,nx*ny,h_output))
+        std::cout << "\tStatus: Success" << std::endl;
+    else
+        std::cout << "\tStatus: Error" << std::endl;
+    _END
+
+    _BEGIN_TEST("coalescedNoBankConflicts")
+
+    kernel = compute::kernel(_coalescedNoBankConflictsKernel(context),"coalescedNoBankConflicts");
+    kernel.set_arg(0,d_input);
+    kernel.set_arg(1,d_output);
+
+    start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
+    queue.finish();
+    elapsed = start.duration().count();
+    std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
+    std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
+
+
+    compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
+
+    if(_checkTransposition(expectedResult,nx*ny,h_output))
+        std::cout << "\tStatus: Success" << std::endl;
+    else
+        std::cout << "\tStatus: Error" << std::endl;
+    _END
+
+    return 0;
+}

From 48687cd29e5422c8c341466bef83059ecd69fae2 Mon Sep 17 00:00:00 2001
From: Benoit Dequidt
Date: Wed, 14 May 2014 23:54:58 +0800
Subject: [PATCH 3/4] Add perf tests for exclusive_scan

- perf_exclusive_scan for Boost.Compute
- perf_thrust_exclusive_scan for thrust
---
 perf/CMakeLists.txt                |  2 +
 perf/perf_exclusive_scan.cpp       | 87 ++++++++++++++++++++++++++++++
 perf/perf_thrust_exclusive_scan.cu | 37 +++++++++++++
 3 files changed, 126 insertions(+)
 create mode 100644 perf/perf_exclusive_scan.cpp
 create mode 100644 perf/perf_thrust_exclusive_scan.cu

diff --git a/perf/CMakeLists.txt b/perf/CMakeLists.txt
index eacd1823..8793a208 100644
--- a/perf/CMakeLists.txt
+++ b/perf/CMakeLists.txt
@@ -29,6 +29,7 @@ set(BENCHMARKS
   sort_float
   unique
   unique_copy
+  exclusive_scan
 )
 
 foreach(BENCHMARK ${BENCHMARKS})
@@ -70,6 +71,7 @@ if(${BOOST_COMPUTE_HAVE_CUDA})
     thrust_partial_sum
     thrust_saxpy
     thrust_sort
+    thrust_exclusive_scan
   )
 
   foreach(BENCHMARK ${CUDA_BENCHMARKS})
diff --git a/perf/perf_exclusive_scan.cpp b/perf/perf_exclusive_scan.cpp
new file mode 100644
index 00000000..37fe1149
--- /dev/null
+++ b/perf/perf_exclusive_scan.cpp
@@ -0,0 +1,87 @@
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+
+#include "perf.hpp"
+
+int rand_int()
+{
+    return static_cast<int>((rand() / double(RAND_MAX)) * 25.0);
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // setup context and queue for the default device
+    boost::compute::device device = boost::compute::system::default_device();
+    boost::compute::context context(device);
+    boost::compute::command_queue queue(context, device);
+    std::cout << "device: " << device.name() << std::endl;
+
+    // create vector of random numbers on the host
+    std::vector<int> host_vector(PERF_N);
+    std::generate(host_vector.begin(), host_vector.end(), rand_int);
+
+    // create vector on the device and copy the data
+    boost::compute::vector<int> device_vector(PERF_N, context);
+    boost::compute::vector<int> device_res(PERF_N,context);
+    boost::compute::copy(
+        host_vector.begin(),
+        host_vector.end(),
+        device_vector.begin(),
+        queue
+    );
+
+    // scan vector
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        boost::compute::copy(
+            host_vector.begin(),
+            host_vector.end(),
+            device_vector.begin(),
+            queue
+        );
+
+        t.start();
+        boost::compute::exclusive_scan(
+            device_vector.begin(),
+            device_vector.end(),
+            device_res.begin(),
+            queue
+        );
+        queue.finish();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // verify scan is correct
+    std::partial_sum(
+        host_vector.begin(),
+        host_vector.end(),
+        host_vector.begin()
+    );
+
+    int device_sum = device_res.back();
+    // when scan is exclusive values are shifted by one on the left
+    // compared to an inclusive scan
+    int host_sum = host_vector[host_vector.size()-2];
+
+    if(device_sum != host_sum){
+        std::cout << "ERROR: "
+                  << "device_sum (" << device_sum << ") "
+                  << "!= "
+                  << "host_sum (" << host_sum << ")"
+                  << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/perf/perf_thrust_exclusive_scan.cu b/perf/perf_thrust_exclusive_scan.cu
new file mode 100644
index 00000000..1e4631e0
--- /dev/null
+++ b/perf/perf_thrust_exclusive_scan.cu
@@ -0,0 +1,37 @@
+#include <algorithm>
+#include <iostream>
+
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+#include <thrust/copy.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);
+
+    // transfer data to the device
+    thrust::device_vector<int> d_vec = h_vec;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        d_vec = h_vec;
+
+        t.start();
+        thrust::exclusive_scan(d_vec.begin(), d_vec.end(), d_vec.begin());
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // transfer data back to host
+    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
+
+    return 0;
+}
\ No newline at end of file

From fed05b3ba5c17918b83f39976e3810682ca09cc8 Mon Sep 17 00:00:00 2001
From: Benoit Dequidt
Date: Wed, 14 May 2014 23:55:14 +0800
Subject: [PATCH 4/4] Update perf_partial_sum : no internal allocation and copy

Calling the partial_sum algorithm with the same input/output buffer
leads to an internal allocation and copy.
---
 perf/perf_partial_sum.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/perf/perf_partial_sum.cpp b/perf/perf_partial_sum.cpp
index 1ceef4c7..cea1df2a 100644
--- a/perf/perf_partial_sum.cpp
+++ b/perf/perf_partial_sum.cpp
@@ -42,6 +42,7 @@ int main(int argc, char *argv[])
 
     // create vector on the device and copy the data
     boost::compute::vector<int> device_vector(PERF_N, context);
+    boost::compute::vector<int> device_res(PERF_N,context);
     boost::compute::copy(
         host_vector.begin(),
         host_vector.end(),
@@ -63,7 +64,7 @@ int main(int argc, char *argv[])
         boost::compute::partial_sum(
             device_vector.begin(),
             device_vector.end(),
-            device_vector.begin(),
+            device_res.begin(),
             queue
         );
         queue.finish();
@@ -78,7 +79,7 @@ int main(int argc, char *argv[])
         host_vector.begin()
     );
 
-    int device_sum = device_vector.back();
+    int device_sum = device_res.back();
     int host_sum = host_vector.back();
 
     if(device_sum != host_sum){