
Merge pull request #111 from banche/develop

matrix transpose and simple moving average examples + exclusive_scan perf tests
Kyle Lutz
2014-05-14 12:10:20 -06:00
7 changed files with 585 additions and 2 deletions


@@ -20,6 +20,8 @@ set(EXAMPLES
time_copy
transform_sqrt
vector_addition
simple_moving_average
matrix_transpose
)
if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})


@@ -0,0 +1,321 @@
#include <iostream>
#include <cstdlib>
#include <sstream>
#include <vector>
#include <boost/compute.hpp>
#include <boost/compute/type_traits/type_name.hpp>
namespace compute = boost::compute;
#define TILE_DIM 32
#define BLOCK_ROWS 8
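// each work-group is TILE_DIM x BLOCK_ROWS work-items and processes one
// TILE_DIM x TILE_DIM tile, so every work-item handles TILE_DIM/BLOCK_ROWS rows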
/// \fn _copyKernel
/// \brief generate a copy kernel program
compute::program _copyKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void copy_kernel(__global const float *src, __global float *dst)
{
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
{
dst[(y+i)*width +x] = src[(y+i)*width + x];
}
}
);
// create copy program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _naiveTransposeKernel
/// \brief generate a naive transpose kernel program
compute::program _naiveTransposeKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void naiveTranspose(__global const float *src, __global float *dst)
{
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
dst[x*width + y+i] = src[(y+i)*width + x];
}
}
);
// create naiveTranspose program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _coalescedTransposeKernel
/// \brief generate a coalesced transpose kernel program
compute::program _coalescedTransposeKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void coalescedTranspose(__global const float *src, __global float *dst)
{
__local float tile[TILE_DIM][TILE_DIM];
// compute indexes
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
// load inside local memory
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
}
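// wait until the whole tile has been written to local memory before any
// work-item in the group reads it back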
barrier(CLK_LOCAL_MEM_FENCE);
// transpose indexes
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
// write output from local memory
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
{
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
}
}
);
// create coalescedTranspose program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _coalescedNoBankConflictsKernel
/// \brief generate a coalesced transpose kernel program without local memory bank conflicts
compute::program _coalescedNoBankConflictsKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void coalescedNoBankConflicts(__global const float *src, __global float *dst)
{
// TILE_DIM+1 is here to avoid bank conflicts in local memory
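// (the extra column offsets each row so that work-items reading a tile
// column hit different local memory banks instead of serializing on one)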
__local float tile[TILE_DIM][TILE_DIM+1];
// compute indexes
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
// load inside local memory
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
}
barrier(CLK_LOCAL_MEM_FENCE);
// transpose indexes
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
// write output from local memory
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
{
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
}
}
);
// create coalescedNoBankConflicts program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _checkTransposition
/// \brief Compare @a expectedResult to @a transposedMatrix
bool _checkTransposition(const std::vector<float>& expectedResult,
uint size,
const std::vector<float>& transposedMatrix)
{
for(uint i = 0 ; i < size ; ++i)
{
if(expectedResult[i] != transposedMatrix[i])
{
std::cout << "idx = " << i << " , expected " << expectedResult[i] <<
" , got " << transposedMatrix[i] << std::endl;
std::cout << "FAILED" << std::endl;
return false;
}
}
return true;
}
/// \fn _generateMatrix
/// \brief generate a matrix inside @a in and store its transposition inside @a transposeRef
void _generateMatrix(std::vector<float>& in, std::vector<float>& transposeRef, uint nx, uint ny)
{
// generate a matrix
for(uint i = 0 ; i < nx ; ++i)
{
for(uint j = 0 ; j < ny ; ++j)
{
in[i*ny + j] = i*ny + j;
}
}
// store transposed result
for(uint j = 0; j < ny ; ++j)
{
for(uint i = 0 ; i < nx ; ++i)
{
transposeRef[j*nx + i] = in[i*ny + j];
}
}
}
#define _BEGIN_TEST(name) std::cout << name << std::endl;
#define _END std::cout << std::endl;
int main()
{
const uint nx = 4096;
const uint ny = 4096;
std::cout << "Matrix Size: " << nx << "x" << ny << std::endl;
std::cout << "Grid Size: " << nx/TILE_DIM << "x" << ny/TILE_DIM << " blocks" << std::endl;
std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
std::cout << std::endl;
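// each work-item covers TILE_DIM/BLOCK_ROWS rows of its tile, so the global
// size in y is scaled down by TILE_DIM/BLOCK_ROWS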
const size_t g_workSize[2] = {nx,ny*BLOCK_ROWS/TILE_DIM};
const size_t l_workSize[2] = {TILE_DIM,BLOCK_ROWS};
const uint size = nx * ny;
std::vector<float> h_input(size);
std::vector<float> h_output(size);
std::vector<float> expectedResult(size);
_generateMatrix(h_input,expectedResult,nx,ny);
// get the default device
compute::device device = compute::system::default_device();
// create a context for the device
compute::context context(device);
// device vectors
compute::vector<float> d_input(size,context);
compute::vector<float> d_output(size,context);
// command_queue with profiling
compute::command_queue queue(context, device,compute::command_queue::enable_profiling);
// copy input data
compute::copy(h_input.begin(),h_input.end(),d_input.begin(),queue);
compute::program copy_program = _copyKernel(context);
compute::kernel kernel(copy_program,"copy_kernel");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
compute::event start;
_BEGIN_TEST("Copy_Kernel");
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(h_input,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("naiveTranspose")
kernel = compute::kernel(_naiveTransposeKernel(context),"naiveTranspose");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("coalescedTranspose")
kernel = compute::kernel(_coalescedTransposeKernel(context),"coalescedTranspose");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("coalescedNoBankConflicts")
kernel = compute::kernel(_coalescedNoBankConflictsKernel(context),"coalescedNoBankConflicts");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
return 0;
}


@@ -0,0 +1,133 @@
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <boost/compute.hpp>
#include <boost/compute/type_traits/type_name.hpp>
namespace compute = boost::compute;
/// Warning: the results lose precision through floating-point error
/// accumulation once the input is large enough. For better precision use
/// double or a Kahan (compensated) summation, otherwise the results can
/// diverge from the CPU implementation.
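// Note: a minimal host-side sketch of Kahan (compensated) summation, shown
// only to illustrate the alternative mentioned above; the name kahan_sum is
// illustrative and the helper is not used elsewhere in this example.
float kahan_sum(const std::vector<float>& values)
{
float sum = 0.f;
float c = 0.f; // running compensation for the low-order bits lost so far
for(std::size_t i = 0 ; i < values.size() ; ++i)
{
float y = values[i] - c; // apply the compensation to the next term
float t = sum + y; // low-order bits of y may be lost in this addition
c = (t - sum) - y; // recover what was lost
sum = t;
}
return sum;
}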
compute::program _sma_program(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
{
const int gid = get_global_id(0);
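// scannedValues holds the inclusive prefix sum of the input, so the sum of the
// window (startIdx, endIdx] is scannedValues[endIdx] - scannedValues[startIdx];
// startIdx == -1 means the window starts at the first element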
float cumValues = 0.;
int endIdx = gid + wSize/2;
int startIdx = gid -1 - wSize/2;
if(endIdx > size -1)
endIdx = size -1;
cumValues += scannedValues[endIdx];
if(startIdx < 0)
startIdx = -1;
else
cumValues -= scannedValues[startIdx];
output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
}
);
// create sma program
compute::program program = compute::program::build_with_source(source,context);
return program;
}
bool _check(const std::vector<float>& values, const std::vector<float>& smoothValues, unsigned int wSize)
{
int size = values.size();
if(size != (int)smoothValues.size()) return false;
int semiWidth = wSize/2;
bool res = true;
for(int idx = 0 ; idx < size ; ++idx)
{
int start = std::max(idx - semiWidth,0);
int end = std::min(idx + semiWidth,size-1);
// compute the reference average on the host
float expected = 0;
for(int j = start ; j <= end ; ++j)
{
expected += values[j];
}
expected /= float(end - start + 1);
if(std::abs(expected - smoothValues[idx]) > 1e-3)
{
std::cout << "idx = " << idx << " -- expected = " << expected << " -- result = " << smoothValues[idx] << std::endl;
res = false;
}
}
return res;
}
// generate a uniform random value in [0,10)
float myRand()
{
static const double divisor = double(RAND_MAX)+1.;
return double(rand())/divisor * 10.;
}
int main()
{
unsigned int size = 1024;
// wSize must be odd
unsigned int wSize = 21;
// get the default device
compute::device device = compute::system::default_device();
// create a context for the device
compute::context context(device);
// get the program
compute::program program = _sma_program(context);
// create vector of random numbers on the host
std::vector<float> host_vector(size);
std::vector<float> host_result(size);
std::generate(host_vector.begin(), host_vector.end(), myRand);
compute::vector<float> a(size,context);
compute::vector<float> b(size,context);
compute::vector<float> c(size,context);
compute::command_queue queue(context, device);
compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
// scan values
compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
// sma kernel
compute::kernel kernel(program, "SMA");
kernel.set_arg(0,b.get_buffer());
kernel.set_arg(1,(int)b.size());
kernel.set_arg(2,c.get_buffer());
kernel.set_arg(3,(int)wSize);
uint tpb = 128;
uint workSize = size;
queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
compute::copy(c.begin(),c.end(),host_result.begin(),queue);
bool res = _check(host_vector,host_result,wSize);
std::string status = res ? "results are equivalent" : "GPU results differ from the CPU ones";
std::cout << status << std::endl;
return 0;
}


@@ -29,6 +29,7 @@ set(BENCHMARKS
sort_float
unique
unique_copy
exclusive_scan
)
foreach(BENCHMARK ${BENCHMARKS})
@@ -70,6 +71,7 @@ if(${BOOST_COMPUTE_HAVE_CUDA})
thrust_partial_sum
thrust_saxpy
thrust_sort
thrust_exclusive_scan
)
foreach(BENCHMARK ${CUDA_BENCHMARKS})


@@ -0,0 +1,87 @@
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/exclusive_scan.hpp>
#include <boost/compute/container/vector.hpp>
#include "perf.hpp"
int rand_int()
{
return static_cast<int>((rand() / double(RAND_MAX)) * 25.0);
}
int main(int argc, char *argv[])
{
perf_parse_args(argc, argv);
std::cout << "size: " << PERF_N << std::endl;
// setup context and queue for the default device
boost::compute::device device = boost::compute::system::default_device();
boost::compute::context context(device);
boost::compute::command_queue queue(context, device);
std::cout << "device: " << device.name() << std::endl;
// create vector of random numbers on the host
std::vector<int> host_vector(PERF_N);
std::generate(host_vector.begin(), host_vector.end(), rand_int);
// create vector on the device and copy the data
boost::compute::vector<int> device_vector(PERF_N, context);
boost::compute::vector<int> device_res(PERF_N,context);
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
device_vector.begin(),
queue
);
// sum vector
perf_timer t;
for(size_t trial = 0; trial < PERF_TRIALS; trial++){
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
device_vector.begin(),
queue
);
t.start();
boost::compute::exclusive_scan(
device_vector.begin(),
device_vector.end(),
device_res.begin(),
queue
);
queue.finish();
t.stop();
}
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
// verify sum is correct
std::partial_sum(
host_vector.begin(),
host_vector.end(),
host_vector.begin()
);
int device_sum = device_res.back();
// an exclusive scan shifts the results one position to the right of an
// inclusive scan (e.g. {1,2,3} -> inclusive {1,3,6}, exclusive {0,1,3}),
// so the last device value corresponds to the next-to-last inclusive sum
int host_sum = host_vector[host_vector.size()-2];
if(device_sum != host_sum){
std::cout << "ERROR: "
<< "device_sum (" << device_sum << ") "
<< "!= "
<< "host_sum (" << host_sum << ")"
<< std::endl;
return -1;
}
return 0;
}


@@ -42,6 +42,7 @@ int main(int argc, char *argv[])
// create vector on the device and copy the data
boost::compute::vector<int> device_vector(PERF_N, context);
boost::compute::vector<int> device_res(PERF_N,context);
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
@@ -63,7 +64,7 @@ int main(int argc, char *argv[])
boost::compute::partial_sum(
device_vector.begin(),
device_vector.end(),
device_vector.begin(),
device_res.begin(),
queue
);
queue.finish();
@@ -78,7 +79,7 @@ int main(int argc, char *argv[])
host_vector.begin()
);
int device_sum = device_vector.back();
int device_sum = device_res.back();
int host_sum = host_vector.back();
if(device_sum != host_sum){


@@ -0,0 +1,37 @@
#include <algorithm>
#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include "perf.hpp"
int main(int argc, char *argv[])
{
perf_parse_args(argc, argv);
std::cout << "size: " << PERF_N << std::endl;
thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);
// transfer data to the device
thrust::device_vector<int> d_vec = h_vec;
perf_timer t;
for(size_t trial = 0; trial < PERF_TRIALS; trial++){
d_vec = h_vec;
t.start();
thrust::exclusive_scan(d_vec.begin(), d_vec.end(), d_vec.begin());
cudaDeviceSynchronize();
t.stop();
}
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
return 0;
}