#include <iostream>
#include <cstdlib>
#include <cstdint>
#include <sstream>
#include <vector>

#include <boost/compute.hpp>
#include <boost/compute/type_traits/type_name.hpp>

namespace compute = boost::compute;

#define TILE_DIM 32
#define BLOCK_ROWS 8

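// Every kernel below is launched over a 2D NDRange in which each work-group
// covers one TILE_DIM x TILE_DIM tile of the matrix using TILE_DIM x BLOCK_ROWS
// work-items, so each work-item loops over TILE_DIM / BLOCK_ROWS rows of its
// tile. Both constants are handed to the OpenCL compiler as -D build options.
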
/// \fn _copyKernel
/// \brief generate a copy kernel program
compute::program _copyKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void copy_kernel(__global const float *src, __global float *dst)
        {
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                dst[(y+i)*width + x] = src[(y+i)*width + x];
            }
        }
    );

    // create copy program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM
            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
    compute::program program = compute::program::build_with_source(source, context, options.str());

    return program;
}

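// The copy kernel above only streams data and gives a bandwidth reference
// point. The naive transpose below keeps the same coalesced read pattern but
// writes dst column-wise, so consecutive work-items store 'width' floats apart
// and the global writes are not coalesced.
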
/// \fn _naiveTransposeKernel
/// \brief generate a naive transpose kernel program
compute::program _naiveTransposeKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void naiveTranspose(__global const float *src, __global float *dst)
        {
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                dst[x*width + y+i] = src[(y+i)*width + x];
            }
        }
    );

    // create naiveTranspose program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM
            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
    compute::program program = compute::program::build_with_source(source, context, options.str());

    return program;
}

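// The coalesced transpose stages one tile in local memory: the tile is loaded
// row-wise from src, the work-group's x/y group indices are swapped, and the
// tile is then read back column-wise so that the stores to dst go to
// consecutive global addresses again.
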
/// \fn _coalescedTransposeKernel
/// \brief generate a coalesced transpose kernel program
compute::program _coalescedTransposeKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalescedTranspose(__global const float *src, __global float *dst)
        {
            __local float tile[TILE_DIM][TILE_DIM];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory
            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // create coalescedTranspose program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM
            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
    compute::program program = compute::program::build_with_source(source, context, options.str());

    return program;
}

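// Reading a column of 'tile' after the barrier maps all of its elements to the
// same local-memory bank on devices with 32 four-byte banks, which serializes
// the accesses. The next kernel pads each row to TILE_DIM+1 floats so that
// consecutive rows start in different banks and the column read is
// conflict-free.
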
/// \fn _coalescedNoBankConflictsKernel
/// \brief generate a coalesced transpose without bank conflicts kernel program
compute::program _coalescedNoBankConflictsKernel(const compute::context& context)
{
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void coalescedNoBankConflicts(__global const float *src, __global float *dst)
        {
            // TILE_DIM+1 is here to avoid bank conflicts in local memory
            __local float tile[TILE_DIM][TILE_DIM+1];

            // compute indexes
            uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
            uint y = get_group_id(1) * TILE_DIM + get_local_id(1);

            uint width = get_num_groups(0) * TILE_DIM;

            // load inside local memory
            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
            }

            barrier(CLK_LOCAL_MEM_FENCE);

            // transpose indexes
            x = get_group_id(1) * TILE_DIM + get_local_id(0);
            y = get_group_id(0) * TILE_DIM + get_local_id(1);

            // write output from local memory
            for(uint i = 0 ; i < TILE_DIM ; i += BLOCK_ROWS)
            {
                dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
            }
        }
    );

    // create coalescedNoBankConflicts program
    std::stringstream options;
    options << "-DTILE_DIM=" << TILE_DIM
            << " -DBLOCK_ROWS=" << BLOCK_ROWS;
    compute::program program = compute::program::build_with_source(source, context, options.str());

    return program;
}

/// \fn _checkTransposition
/// \brief Compare @a expectedResult to @a transposedMatrix
bool _checkTransposition(const std::vector<float>& expectedResult,
                         uint size,
                         const std::vector<float>& transposedMatrix)
{
    for(uint i = 0 ; i < size ; ++i)
    {
        if(expectedResult[i] != transposedMatrix[i])
        {
            std::cout << "idx = " << i << " , expected " << expectedResult[i] <<
                         " , got " << transposedMatrix[i] << std::endl;
            std::cout << "FAILED" << std::endl;
            return false;
        }
    }
    return true;
}

/// \fn _generateMatrix
/// \brief generate a matrix inside @a in and do the transposition inside @a transposeRef
void _generateMatrix(std::vector<float>& in, std::vector<float>& transposeRef, uint nx, uint ny)
{
    // generate a matrix
    for(uint i = 0 ; i < nx ; ++i)
    {
        for(uint j = 0 ; j < ny ; ++j)
        {
            in[i*ny + j] = i*ny + j;
        }
    }

    // store transposed result
    for(uint j = 0 ; j < ny ; ++j)
    {
        for(uint i = 0 ; i < nx ; ++i)
        {
            transposeRef[j*nx + i] = in[i*ny + j];
        }
    }
}

#define _BEGIN_TEST(name) std::cout << name << std::endl;
#define _END std::cout << std::endl;

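// main() benchmarks each kernel on an nx x ny float matrix. The NDRange is
// {nx, ny * BLOCK_ROWS / TILE_DIM} global work-items with a
// {TILE_DIM, BLOCK_ROWS} local size, i.e. nx/TILE_DIM x ny/TILE_DIM
// work-groups. Timings come from the event of a profiling-enabled queue, and
// the reported bandwidth is one read plus one write of the matrix
// (2 * nx * ny * sizeof(float) bytes) divided by the elapsed nanoseconds,
// which yields GB/s.
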
int main()
{
    const uint nx = 4096;
    const uint ny = 4096;

    std::cout << "Matrix Size: " << nx << "x" << ny << std::endl;
    std::cout << "Grid Size: " << nx/TILE_DIM << "x" << ny/TILE_DIM << " blocks" << std::endl;
    std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;

    std::cout << std::endl;

    const size_t g_workSize[2] = {nx, ny*BLOCK_ROWS/TILE_DIM};
    const size_t l_workSize[2] = {TILE_DIM, BLOCK_ROWS};

    const uint size = nx * ny;

    std::vector<float> h_input(size);
    std::vector<float> h_output(size);
    std::vector<float> expectedResult(size);
    _generateMatrix(h_input, expectedResult, nx, ny);

    // get the default device
    compute::device device = compute::system::default_device();

    // create a context for the device
    compute::context context(device);

    // device vectors
    compute::vector<float> d_input(size, context);
    compute::vector<float> d_output(size, context);

    // command_queue with profiling
    compute::command_queue queue(context, device, compute::command_queue::enable_profiling);

    // copy input data
    compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);

    compute::program copy_program = _copyKernel(context);
    compute::kernel kernel(copy_program, "copy_kernel");
    kernel.set_arg(0, d_input);
    kernel.set_arg(1, d_output);

    compute::event start;
    _BEGIN_TEST("Copy_Kernel")
    start = queue.enqueue_nd_range_kernel(kernel, 2, 0, g_workSize, l_workSize);
    queue.finish();
    uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();

    std::cout << "\tElapsed: " << elapsed << " ns" << std::endl;
    std::cout << "\tBandwidth: " << 2 * nx * ny * sizeof(float) / elapsed << " GB/s" << std::endl;
    compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);

    // the copy kernel should reproduce the input unchanged
    if(_checkTransposition(h_input, nx*ny, h_output))
        std::cout << "\tStatus: Success" << std::endl;
    else
        std::cout << "\tStatus: Error" << std::endl;
    _END

_BEGIN_TEST("naiveTranspose")
|
|
|
|
kernel = compute::kernel(_naiveTransposeKernel(context),"naiveTranspose");
|
|
kernel.set_arg(0,d_input);
|
|
kernel.set_arg(1,d_output);
|
|
|
|
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
|
queue.finish();
|
|
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
|
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
|
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
|
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
|
|
|
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
|
std::cout << "\tStatus: Success" << std::endl;
|
|
else
|
|
std::cout << "\tStatus: Error" << std::endl;
|
|
_END
|
|
|
|
_BEGIN_TEST("coalescedTranspose")
|
|
|
|
kernel = compute::kernel(_coalescedTransposeKernel(context),"coalescedTranspose");
|
|
kernel.set_arg(0,d_input);
|
|
kernel.set_arg(1,d_output);
|
|
|
|
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
|
queue.finish();
|
|
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
|
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
|
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
|
|
|
|
|
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
|
|
|
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
|
std::cout << "\tStatus: Success" << std::endl;
|
|
else
|
|
std::cout << "\tStatus: Error" << std::endl;
|
|
_END
|
|
|
|
_BEGIN_TEST("coalescedNoBankConflicts")
|
|
|
|
kernel = compute::kernel(_coalescedNoBankConflictsKernel(context),"coalescedNoBankConflicts");
|
|
kernel.set_arg(0,d_input);
|
|
kernel.set_arg(1,d_output);
|
|
|
|
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
|
queue.finish();
|
|
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
|
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
|
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
|
|
|
|
|
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
|
|
|
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
|
std::cout << "\tStatus: Success" << std::endl;
|
|
else
|
|
std::cout << "\tStatus: Error" << std::endl;
|
|
_END
|
|
|
|
return 0;
|
|
}
|
|
|
|
|