mirror of
https://github.com/boostorg/compute.git
synced 2026-01-27 18:52:15 +00:00
Merge pull request #111 from banche/develop
matrix transpose and simple moving average examples + exclusive_scan perf tests
This commit is contained in:
@@ -20,6 +20,8 @@ set(EXAMPLES
|
||||
time_copy
|
||||
transform_sqrt
|
||||
vector_addition
|
||||
simple_moving_average
|
||||
matrix_transpose
|
||||
)
|
||||
|
||||
if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})
|
||||
|
||||
321
example/matrix_transpose.cpp
Normal file
321
example/matrix_transpose.cpp
Normal file
@@ -0,0 +1,321 @@
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <boost/compute.hpp>
|
||||
#include <boost/compute/type_traits/type_name.hpp>
|
||||
|
||||
namespace compute = boost::compute;
|
||||
|
||||
#define TILE_DIM 32
|
||||
#define BLOCK_ROWS 8
|
||||
|
||||
/// \fn _copyKernel
|
||||
/// \brief generate a copy kernel program
|
||||
compute::program _copyKernel(const compute::context& context)
|
||||
{
|
||||
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
|
||||
__kernel void copy_kernel(__global const float *src, __global float *dst)
|
||||
{
|
||||
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
|
||||
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
|
||||
|
||||
uint width = get_num_groups(0) * TILE_DIM;
|
||||
|
||||
for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
|
||||
{
|
||||
dst[(y+i)*width +x] = src[(y+i)*width + x];
|
||||
}
|
||||
}
|
||||
);
|
||||
// create copy program
|
||||
std::stringstream options;
|
||||
options << "-DTILE_DIM=" << TILE_DIM
|
||||
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
|
||||
compute::program program = compute::program::build_with_source(source,context,options.str());
|
||||
return program;
|
||||
}
|
||||
|
||||
/// \fn _naiveTransposeKernel
|
||||
/// \brief generate a naive transpose kernel program
|
||||
compute::program _naiveTransposeKernel(const compute::context& context)
|
||||
{
|
||||
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
|
||||
__kernel void naiveTranspose(__global const float *src, __global float *dst)
|
||||
{
|
||||
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
|
||||
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
|
||||
|
||||
uint width = get_num_groups(0) * TILE_DIM;
|
||||
|
||||
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
|
||||
{
|
||||
dst[x*width + y+i] = src[(y+i)*width + x];
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// create naiveTranspose program
|
||||
std::stringstream options;
|
||||
options << "-DTILE_DIM=" << TILE_DIM
|
||||
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
|
||||
compute::program program = compute::program::build_with_source(source,context,options.str());
|
||||
return program;
|
||||
}
|
||||
|
||||
/// \fn _coalescedTransposeKernel
|
||||
/// \brief generate a coalesced transpose kernel program
|
||||
compute::program _coalescedTransposeKernel(const compute::context& context)
|
||||
{
|
||||
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
|
||||
__kernel void coalescedTranspose(__global const float *src, __global float *dst)
|
||||
{
|
||||
__local float tile[TILE_DIM][TILE_DIM];
|
||||
|
||||
// compute indexes
|
||||
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
|
||||
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
|
||||
|
||||
uint width = get_num_groups(0) * TILE_DIM;
|
||||
|
||||
// load inside local memory
|
||||
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
|
||||
{
|
||||
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// transpose indexes
|
||||
x = get_group_id(1) * TILE_DIM + get_local_id(0);
|
||||
y = get_group_id(0) * TILE_DIM + get_local_id(1);
|
||||
|
||||
// write output from local memory
|
||||
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
|
||||
{
|
||||
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
|
||||
}
|
||||
|
||||
}
|
||||
);
|
||||
|
||||
// create naiveTranspose program
|
||||
std::stringstream options;
|
||||
options << "-DTILE_DIM=" << TILE_DIM
|
||||
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
|
||||
compute::program program = compute::program::build_with_source(source,context,options.str());
|
||||
return program;
|
||||
}
|
||||
|
||||
/// \fn _coalescedNoBankConflictsKernel
|
||||
/// \brief generate a coalesced withtout bank conflicts kernel program
|
||||
compute::program _coalescedNoBankConflictsKernel(const compute::context& context)
|
||||
{
|
||||
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
|
||||
__kernel void coalescedNoBankConflicts(__global const float *src, __global float *dst)
|
||||
{
|
||||
// TILE_DIM+1 is here to avoid bank conflicts in local memory
|
||||
__local float tile[TILE_DIM][TILE_DIM+1];
|
||||
|
||||
// compute indexes
|
||||
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
|
||||
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
|
||||
|
||||
uint width = get_num_groups(0) * TILE_DIM;
|
||||
|
||||
// load inside local memory
|
||||
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
|
||||
{
|
||||
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// transpose indexes
|
||||
x = get_group_id(1) * TILE_DIM + get_local_id(0);
|
||||
y = get_group_id(0) * TILE_DIM + get_local_id(1);
|
||||
// write output from local memory
|
||||
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
|
||||
{
|
||||
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
|
||||
}
|
||||
|
||||
}
|
||||
);
|
||||
|
||||
// create naiveTranspose program
|
||||
std::stringstream options;
|
||||
options << "-DTILE_DIM=" << TILE_DIM
|
||||
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
|
||||
compute::program program = compute::program::build_with_source(source,context,options.str());
|
||||
return program;
|
||||
}
|
||||
|
||||
/// \fn _checkTransposition
/// \brief Element-wise comparison of @a transposedMatrix against
///        @a expectedResult over the first @a size entries.
/// \return true when every entry matches, false at the first mismatch
///         (which is reported on stdout).
bool _checkTransposition(const std::vector<float>& expectedResult,
                         uint size,
                         const std::vector<float>& transposedMatrix)
{
    uint idx = 0;
    while(idx < size)
    {
        if(expectedResult[idx] != transposedMatrix[idx])
        {
            // report the first differing element, then bail out
            std::cout << "idx = " << idx << " , expected " << expectedResult[idx] <<
                " , got " << transposedMatrix[idx] << std::endl;
            std::cout << "FAILED" << std::endl;
            return false;
        }
        ++idx;
    }
    return true;
}
|
||||
|
||||
/// \fn _generateMatrix
/// \brief Fill @a in (nx rows, ny columns, row-major) with the values
///        0..nx*ny-1 and store its transpose into @a transposeRef, which is
///        later used as the reference result. Both vectors must already be
///        sized to nx*ny.
void _generateMatrix(std::vector<float>& in, std::vector<float>& transposeRef, uint nx, uint ny)
{
    // fill each cell with its own linear index
    for(uint row = 0 ; row < nx ; ++row)
    {
        for(uint col = 0 ; col < ny ; ++col)
        {
            in[row*ny + col] = row*ny + col;
        }
    }

    // host-side transpose: element (row,col) moves to (col,row)
    for(uint col = 0 ; col < ny ; ++col)
    {
        for(uint row = 0 ; row < nx ; ++row)
        {
            transposeRef[col*nx + row] = in[row*ny + col];
        }
    }
}
|
||||
|
||||
#define _BEGIN_TEST(name) std::cout << name << std::endl;
|
||||
#define _END std::cout << std::endl;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
const uint nx = 4096;
|
||||
const uint ny = 4096;
|
||||
|
||||
std::cout << "Matrix Size: " << nx << "x" << ny << std::endl;
|
||||
std::cout << "Grid Size: " << nx/TILE_DIM << "x" << ny/TILE_DIM << " blocks" << std::endl;
|
||||
std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
|
||||
|
||||
std::cout << std::endl;
|
||||
|
||||
const size_t g_workSize[2] = {nx,ny*BLOCK_ROWS/TILE_DIM};
|
||||
const size_t l_workSize[2] = {TILE_DIM,BLOCK_ROWS};
|
||||
|
||||
const uint size = nx * ny;
|
||||
|
||||
std::vector<float> h_input(size);
|
||||
std::vector<float> h_output(size);
|
||||
std::vector<float> expectedResult(size);
|
||||
_generateMatrix(h_input,expectedResult,nx,ny);
|
||||
|
||||
// get the default device
|
||||
compute::device device = compute::system::default_device();
|
||||
|
||||
// create a context for the device
|
||||
compute::context context(device);
|
||||
|
||||
// device vectors
|
||||
compute::vector<float> d_input(size,context);
|
||||
compute::vector<float> d_output(size,context);
|
||||
|
||||
// command_queue with profiling
|
||||
compute::command_queue queue(context, device,compute::command_queue::enable_profiling);
|
||||
|
||||
// copy input data
|
||||
compute::copy(h_input.begin(),h_input.end(),d_input.begin(),queue);
|
||||
|
||||
compute::program copy_program = _copyKernel(context);
|
||||
compute::kernel kernel(copy_program,"copy_kernel");
|
||||
kernel.set_arg(0,d_input);
|
||||
kernel.set_arg(1,d_output);
|
||||
|
||||
compute::event start;
|
||||
_BEGIN_TEST("Copy_Kernel");
|
||||
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
||||
queue.finish();
|
||||
uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
||||
|
||||
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
||||
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
||||
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
||||
|
||||
if(_checkTransposition(h_input,nx*ny,h_output))
|
||||
std::cout << "\tStatus: Success" << std::endl;
|
||||
else
|
||||
std::cout << "\tStatus: Error" << std::endl;
|
||||
_END
|
||||
|
||||
_BEGIN_TEST("naiveTranspose")
|
||||
|
||||
kernel = compute::kernel(_naiveTransposeKernel(context),"naiveTranspose");
|
||||
kernel.set_arg(0,d_input);
|
||||
kernel.set_arg(1,d_output);
|
||||
|
||||
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
||||
queue.finish();
|
||||
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
||||
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
||||
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
||||
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
||||
|
||||
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
||||
std::cout << "\tStatus: Success" << std::endl;
|
||||
else
|
||||
std::cout << "\tStatus: Error" << std::endl;
|
||||
_END
|
||||
|
||||
_BEGIN_TEST("coalescedTranspose")
|
||||
|
||||
kernel = compute::kernel(_coalescedTransposeKernel(context),"coalescedTranspose");
|
||||
kernel.set_arg(0,d_input);
|
||||
kernel.set_arg(1,d_output);
|
||||
|
||||
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
||||
queue.finish();
|
||||
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
||||
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
||||
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
||||
|
||||
|
||||
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
||||
|
||||
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
||||
std::cout << "\tStatus: Success" << std::endl;
|
||||
else
|
||||
std::cout << "\tStatus: Error" << std::endl;
|
||||
_END
|
||||
|
||||
_BEGIN_TEST("coalescedNoBankConflicts")
|
||||
|
||||
kernel = compute::kernel(_coalescedNoBankConflictsKernel(context),"coalescedNoBankConflicts");
|
||||
kernel.set_arg(0,d_input);
|
||||
kernel.set_arg(1,d_output);
|
||||
|
||||
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
|
||||
queue.finish();
|
||||
elapsed = start.duration<boost::chrono::nanoseconds>().count();
|
||||
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
|
||||
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
|
||||
|
||||
|
||||
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
|
||||
|
||||
if(_checkTransposition(expectedResult,nx*ny,h_output))
|
||||
std::cout << "\tStatus: Success" << std::endl;
|
||||
else
|
||||
std::cout << "\tStatus: Error" << std::endl;
|
||||
_END
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
133
example/simple_moving_average.cpp
Normal file
133
example/simple_moving_average.cpp
Normal file
@@ -0,0 +1,133 @@
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <boost/compute.hpp>
|
||||
#include <boost/compute/type_traits/type_name.hpp>
|
||||
|
||||
namespace compute = boost::compute;
|
||||
|
||||
|
||||
|
||||
/// warning precision is not precise due
|
||||
/// to the float error accumulation when size is large enough
|
||||
/// for more precision use double
|
||||
/// or a kahan sum else results can diverge
|
||||
/// from the CPU implementation
|
||||
compute::program _sma_program(const compute::context& context)
|
||||
{
|
||||
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
|
||||
__kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
|
||||
{
|
||||
const int gid = get_global_id(0);
|
||||
|
||||
float cumValues = 0.;
|
||||
int endIdx = gid + wSize/2;
|
||||
int startIdx = gid -1 - wSize/2;
|
||||
|
||||
if(endIdx > size -1)
|
||||
endIdx = size -1;
|
||||
|
||||
cumValues += scannedValues[endIdx];
|
||||
if(startIdx < 0)
|
||||
startIdx = -1;
|
||||
else
|
||||
cumValues -= scannedValues[startIdx];
|
||||
|
||||
output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
// create sma program
|
||||
compute::program program = compute::program::build_with_source(source,context);
|
||||
return program;
|
||||
}
|
||||
|
||||
|
||||
/// \brief CPU reference check of the GPU moving average.
///
/// Recomputes the simple moving average of @a values with window @a wSize on
/// the host and compares it against @a smoothValues, reporting every index
/// that differs by more than 1e-3.
/// \return true when the two series are equivalent, false otherwise.
bool _check(const std::vector<float>& values, const std::vector<float>& smoothValues, unsigned int wSize)
{
    int size = values.size();
    if(size != (int)smoothValues.size()) return false;

    int semiWidth = wSize/2;

    bool ok = true;
    for(int idx = 0 ; idx < size ; ++idx)
    {
        int start = std::max(idx - semiWidth,0);
        int end = std::min(idx + semiWidth,size-1);
        // BUG FIX: this accumulator used to be declared `float res`, shadowing
        // the outer result flag; `res = false` on a mismatch then wrote into
        // the float, so the function always returned true.
        float sum = 0;
        for(int j = start ; j <= end ; ++j)
        {
            sum += values[j];
        }

        sum /= float(end - start +1);

        if(std::abs(sum-smoothValues[idx]) > 1e-3)
        {
            std::cout << "idx = " << idx << " -- expected = " << sum << " -- result = " << smoothValues[idx] << std::endl;
            ok = false;
        }
    }

    return ok;
}
|
||||
|
||||
|
||||
// generate a uniform law over [0,10[
float myRand()
{
    // RAND_MAX+1 as the divisor keeps the unit sample strictly below 1
    static const double divisor = double(RAND_MAX)+1.;
    const double unitSample = double(rand())/divisor;
    return unitSample * 10.;
}
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
unsigned int size = 1024;
|
||||
// wSize must be odd
|
||||
unsigned int wSize = 21;
|
||||
// get the default device
|
||||
compute::device device = compute::system::default_device();
|
||||
// create a context for the device
|
||||
compute::context context(device);
|
||||
// get the program
|
||||
compute::program program = _sma_program(context);
|
||||
|
||||
// create vector of random numbers on the host
|
||||
std::vector<float> host_vector(size);
|
||||
std::vector<float> host_result(size);
|
||||
std::generate(host_vector.begin(), host_vector.end(), myRand);
|
||||
|
||||
compute::vector<float> a(size,context);
|
||||
compute::vector<float> b(size,context);
|
||||
compute::vector<float> c(size,context);
|
||||
compute::command_queue queue(context, device);
|
||||
|
||||
compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
|
||||
|
||||
// scan values
|
||||
compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
|
||||
// sma kernel
|
||||
compute::kernel kernel(program, "SMA");
|
||||
kernel.set_arg(0,b.get_buffer());
|
||||
kernel.set_arg(1,(int)b.size());
|
||||
kernel.set_arg(2,c.get_buffer());
|
||||
kernel.set_arg(3,(int)wSize);
|
||||
|
||||
uint tpb = 128;
|
||||
uint workSize = size;
|
||||
queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
|
||||
|
||||
compute::copy(c.begin(),c.end(),host_result.begin(),queue);
|
||||
|
||||
bool res = _check(host_vector,host_result,wSize);
|
||||
std::string status = res ? "results are equivalent" : "GPU results differs from CPU one's";
|
||||
std::cout << status << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ set(BENCHMARKS
|
||||
sort_float
|
||||
unique
|
||||
unique_copy
|
||||
exclusive_scan
|
||||
)
|
||||
|
||||
foreach(BENCHMARK ${BENCHMARKS})
|
||||
@@ -70,6 +71,7 @@ if(${BOOST_COMPUTE_HAVE_CUDA})
|
||||
thrust_partial_sum
|
||||
thrust_saxpy
|
||||
thrust_sort
|
||||
thrust_exclusive_scan
|
||||
)
|
||||
|
||||
foreach(BENCHMARK ${CUDA_BENCHMARKS})
|
||||
|
||||
87
perf/perf_exclusive_scan.cpp
Normal file
87
perf/perf_exclusive_scan.cpp
Normal file
@@ -0,0 +1,87 @@
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/compute/system.hpp>
|
||||
#include <boost/compute/algorithm/exclusive_scan.hpp>
|
||||
#include <boost/compute/container/vector.hpp>
|
||||
|
||||
#include "perf.hpp"
|
||||
|
||||
// Pseudo-random int in [0, 25] used to fill the benchmark input.
int rand_int()
{
    const double unit = rand() / double(RAND_MAX);
    return static_cast<int>(unit * 25.0);
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
perf_parse_args(argc, argv);
|
||||
|
||||
std::cout << "size: " << PERF_N << std::endl;
|
||||
|
||||
// setup context and queue for the default device
|
||||
boost::compute::device device = boost::compute::system::default_device();
|
||||
boost::compute::context context(device);
|
||||
boost::compute::command_queue queue(context, device);
|
||||
std::cout << "device: " << device.name() << std::endl;
|
||||
|
||||
// create vector of random numbers on the host
|
||||
std::vector<int> host_vector(PERF_N);
|
||||
std::generate(host_vector.begin(), host_vector.end(), rand_int);
|
||||
|
||||
// create vector on the device and copy the data
|
||||
boost::compute::vector<int> device_vector(PERF_N, context);
|
||||
boost::compute::vector<int> device_res(PERF_N,context);
|
||||
boost::compute::copy(
|
||||
host_vector.begin(),
|
||||
host_vector.end(),
|
||||
device_vector.begin(),
|
||||
queue
|
||||
);
|
||||
|
||||
// sum vector
|
||||
perf_timer t;
|
||||
for(size_t trial = 0; trial < PERF_TRIALS; trial++){
|
||||
boost::compute::copy(
|
||||
host_vector.begin(),
|
||||
host_vector.end(),
|
||||
device_vector.begin(),
|
||||
queue
|
||||
);
|
||||
|
||||
t.start();
|
||||
boost::compute::exclusive_scan(
|
||||
device_vector.begin(),
|
||||
device_vector.end(),
|
||||
device_res.begin(),
|
||||
queue
|
||||
);
|
||||
queue.finish();
|
||||
t.stop();
|
||||
}
|
||||
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
|
||||
|
||||
// verify sum is correct
|
||||
std::partial_sum(
|
||||
host_vector.begin(),
|
||||
host_vector.end(),
|
||||
host_vector.begin()
|
||||
);
|
||||
|
||||
int device_sum = device_res.back();
|
||||
// when scan is exclusive values are shifted by one on the left
|
||||
// compared to a inclusive scan
|
||||
int host_sum = host_vector[host_vector.size()-2];
|
||||
|
||||
if(device_sum != host_sum){
|
||||
std::cout << "ERROR: "
|
||||
<< "device_sum (" << device_sum << ") "
|
||||
<< "!= "
|
||||
<< "host_sum (" << host_sum << ")"
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -42,6 +42,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
// create vector on the device and copy the data
|
||||
boost::compute::vector<int> device_vector(PERF_N, context);
|
||||
boost::compute::vector<int> device_res(PERF_N,context);
|
||||
boost::compute::copy(
|
||||
host_vector.begin(),
|
||||
host_vector.end(),
|
||||
@@ -63,7 +64,7 @@ int main(int argc, char *argv[])
|
||||
boost::compute::partial_sum(
|
||||
device_vector.begin(),
|
||||
device_vector.end(),
|
||||
device_vector.begin(),
|
||||
device_res.begin(),
|
||||
queue
|
||||
);
|
||||
queue.finish();
|
||||
@@ -78,7 +79,7 @@ int main(int argc, char *argv[])
|
||||
host_vector.begin()
|
||||
);
|
||||
|
||||
int device_sum = device_vector.back();
|
||||
int device_sum = device_res.back();
|
||||
int host_sum = host_vector.back();
|
||||
|
||||
if(device_sum != host_sum){
|
||||
|
||||
37
perf/perf_thrust_exclusive_scan.cu
Normal file
37
perf/perf_thrust_exclusive_scan.cu
Normal file
@@ -0,0 +1,37 @@
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/generate.h>
|
||||
#include <thrust/host_vector.h>
|
||||
#include <thrust/scan.h>
|
||||
|
||||
#include "perf.hpp"
|
||||
|
||||
// Thrust counterpart of perf_exclusive_scan.cpp: times an in-place
// exclusive_scan on the device, keeping the fastest of PERF_TRIALS runs.
int main(int argc, char *argv[])
{
    perf_parse_args(argc, argv);

    std::cout << "size: " << PERF_N << std::endl;
    thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);

    // transfer data to the device
    thrust::device_vector<int> d_vec = h_vec;

    perf_timer t;
    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
        // the scan is in-place, so restore the original input each trial
        d_vec = h_vec;

        t.start();
        thrust::exclusive_scan(d_vec.begin(), d_vec.end(), d_vec.begin());
        // thrust launches are asynchronous: wait before stopping the timer
        cudaDeviceSynchronize();
        t.stop();
    }
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;

    // transfer data back to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    return 0;
}
|
||||
Reference in New Issue
Block a user