
Merge pull request #111 from banche/develop

matrix transpose and simple moving average examples + exclusive_scan perf tests
Kyle Lutz
2014-05-14 12:10:20 -06:00
7 changed files with 585 additions and 2 deletions


@@ -20,6 +20,8 @@ set(EXAMPLES
time_copy
transform_sqrt
vector_addition
simple_moving_average
matrix_transpose
)
if (${BOOST_COMPUTE_USE_OFFLINE_CACHE})


@@ -0,0 +1,321 @@
#include <iostream>
#include <cstdlib>
#include <sstream>
#include <vector>
#include <boost/compute.hpp>
#include <boost/compute/type_traits/type_name.hpp>
namespace compute = boost::compute;
#define TILE_DIM 32
#define BLOCK_ROWS 8
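// each work-group is TILE_DIM x BLOCK_ROWS work-items and processes one
// TILE_DIM x TILE_DIM tile, so every work-item handles TILE_DIM/BLOCK_ROWS rows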
/// \fn _copyKernel
/// \brief generate a copy kernel program
compute::program _copyKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void copy_kernel(__global const float *src, __global float *dst)
{
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS)
{
dst[(y+i)*width +x] = src[(y+i)*width + x];
}
}
);
// create copy program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _naiveTransposeKernel
/// \brief generate a naive transpose kernel program
compute::program _naiveTransposeKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void naiveTranspose(__global const float *src, __global float *dst)
{
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
dst[x*width + y+i] = src[(y+i)*width + x];
}
}
);
// create naiveTranspose program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _coalescedTransposeKernel
/// \brief generate a coalesced transpose kernel program
compute::program _coalescedTransposeKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void coalescedTranspose(__global const float *src, __global float *dst)
{
__local float tile[TILE_DIM][TILE_DIM];
// compute indexes
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
// load inside local memory
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
}
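// wait until the whole tile has been written to local memory before any
// work-item in the group reads it back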
barrier(CLK_LOCAL_MEM_FENCE);
// transpose indexes
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
// write output from local memory
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
{
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
}
}
);
// create coalescedTranspose program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _coalescedNoBankConflictsKernel
/// \brief generate a coalesced transpose kernel program without local memory bank conflicts
compute::program _coalescedNoBankConflictsKernel(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void coalescedNoBankConflicts(__global const float *src, __global float *dst)
{
// TILE_DIM+1 is here to avoid bank conflicts in local memory
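// (the extra column offsets each row so that work-items reading a tile
// column hit different local memory banks instead of serializing on one)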
__local float tile[TILE_DIM][TILE_DIM+1];
// compute indexes
uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
uint width = get_num_groups(0) * TILE_DIM;
// load inside local memory
for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS)
{
tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
}
barrier(CLK_LOCAL_MEM_FENCE);
// transpose indexes
x = get_group_id(1) * TILE_DIM + get_local_id(0);
y = get_group_id(0) * TILE_DIM + get_local_id(1);
// write output from local memory
for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS)
{
dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
}
}
);
// create coalescedNoBankConflicts program
std::stringstream options;
options << "-DTILE_DIM=" << TILE_DIM
<< " -DBLOCK_ROWS=" << BLOCK_ROWS;
compute::program program = compute::program::build_with_source(source,context,options.str());
return program;
}
/// \fn _checkTransposition
/// \brief Compare @a expectedResult to @a transposedMatrix
bool _checkTransposition(const std::vector<float>& expectedResult,
uint size,
const std::vector<float>& transposedMatrix)
{
for(uint i = 0 ; i < size ; ++i)
{
if(expectedResult[i] != transposedMatrix[i])
{
std::cout << "idx = " << i << " , expected " << expectedResult[i] <<
" , got " << transposedMatrix[i] << std::endl;
std::cout << "FAILED" << std::endl;
return false;
}
}
return true;
}
/// \fn _generateMatrix
/// \brief generate a matrix inside @a in and store its transposition inside @a transposeRef
void _generateMatrix(std::vector<float>& in, std::vector<float>& transposeRef, uint nx, uint ny)
{
// generate a matrix
for(uint i = 0 ; i < nx ; ++i)
{
for(uint j = 0 ; j < ny ; ++j)
{
in[i*ny + j] = i*ny + j;
}
}
// store transposed result
for(uint j = 0; j < ny ; ++j)
{
for(uint i = 0 ; i < nx ; ++i)
{
transposeRef[j*nx + i] = in[i*ny + j];
}
}
}
#define _BEGIN_TEST(name) std::cout << name << std::endl;
#define _END std::cout << std::endl;
int main()
{
const uint nx = 4096;
const uint ny = 4096;
std::cout << "Matrix Size: " << nx << "x" << ny << std::endl;
std::cout << "Grid Size: " << nx/TILE_DIM << "x" << ny/TILE_DIM << " blocks" << std::endl;
std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
std::cout << std::endl;
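// each work-item covers TILE_DIM/BLOCK_ROWS rows of its tile, so the global
// size in y is scaled down by TILE_DIM/BLOCK_ROWS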
const size_t g_workSize[2] = {nx,ny*BLOCK_ROWS/TILE_DIM};
const size_t l_workSize[2] = {TILE_DIM,BLOCK_ROWS};
const uint size = nx * ny;
std::vector<float> h_input(size);
std::vector<float> h_output(size);
std::vector<float> expectedResult(size);
_generateMatrix(h_input,expectedResult,nx,ny);
// get the default device
compute::device device = compute::system::default_device();
// create a context for the device
compute::context context(device);
// device vectors
compute::vector<float> d_input(size,context);
compute::vector<float> d_output(size,context);
// command_queue with profiling
compute::command_queue queue(context, device,compute::command_queue::enable_profiling);
// copy input data
compute::copy(h_input.begin(),h_input.end(),d_input.begin(),queue);
compute::program copy_program = _copyKernel(context);
compute::kernel kernel(copy_program,"copy_kernel");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
compute::event start;
_BEGIN_TEST("Copy_Kernel");
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(h_input,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("naiveTranspose")
kernel = compute::kernel(_naiveTransposeKernel(context),"naiveTranspose");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("coalescedTranspose")
kernel = compute::kernel(_coalescedTransposeKernel(context),"coalescedTranspose");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
_BEGIN_TEST("coalescedNoBankConflicts")
kernel = compute::kernel(_coalescedNoBankConflictsKernel(context),"coalescedNoBankConflicts");
kernel.set_arg(0,d_input);
kernel.set_arg(1,d_output);
start = queue.enqueue_nd_range_kernel(kernel,2,0,g_workSize,l_workSize);
queue.finish();
elapsed = start.duration<boost::chrono::nanoseconds>().count();
std::cout << "\tElapsed: " << elapsed << " ns"<< std::endl;
std::cout << "\tBandWidth: " << 2 * nx * ny *sizeof(float) / elapsed << " GB/s" << std::endl;
compute::copy(d_output.begin(),d_output.end(),h_output.begin(),queue);
if(_checkTransposition(expectedResult,nx*ny,h_output))
std::cout << "\tStatus: Success" << std::endl;
else
std::cout << "\tStatus: Error" << std::endl;
_END
return 0;
}


@@ -0,0 +1,133 @@
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <boost/compute.hpp>
#include <boost/compute/type_traits/type_name.hpp>
namespace compute = boost::compute;
/// Warning: the results lose precision through floating-point error
/// accumulation once the input is large enough. For better precision use
/// double or a Kahan (compensated) summation, otherwise the results can
/// diverge from the CPU implementation.
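// Note: a minimal host-side sketch of Kahan (compensated) summation, shown
// only to illustrate the alternative mentioned above; the name kahan_sum is
// illustrative and the helper is not used elsewhere in this example.
float kahan_sum(const std::vector<float>& values)
{
float sum = 0.f;
float c = 0.f; // running compensation for the low-order bits lost so far
for(std::size_t i = 0 ; i < values.size() ; ++i)
{
float y = values[i] - c; // apply the compensation to the next term
float t = sum + y; // low-order bits of y may be lost in this addition
c = (t - sum) - y; // recover what was lost
sum = t;
}
return sum;
}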
compute::program _sma_program(const compute::context& context)
{
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
__kernel void SMA(__global const float *scannedValues, int size, __global float *output, int wSize)
{
const int gid = get_global_id(0);
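// scannedValues holds the inclusive prefix sum of the input, so the sum of the
// window (startIdx, endIdx] is scannedValues[endIdx] - scannedValues[startIdx];
// startIdx == -1 means the window starts at the first element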
float cumValues = 0.;
int endIdx = gid + wSize/2;
int startIdx = gid -1 - wSize/2;
if(endIdx > size -1)
endIdx = size -1;
cumValues += scannedValues[endIdx];
if(startIdx < 0)
startIdx = -1;
else
cumValues -= scannedValues[startIdx];
output[gid] =(float)( cumValues / ( float )(endIdx - startIdx));
}
);
// create sma program
compute::program program = compute::program::build_with_source(source,context);
return program;
}
bool _check(const std::vector<float>& values, const std::vector<float>& smoothValues, unsigned int wSize)
{
int size = values.size();
if(size != (int)smoothValues.size()) return false;
int semiWidth = wSize/2;
bool res = true;
for(int idx = 0 ; idx < size ; ++idx)
{
int start = std::max(idx - semiWidth,0);
int end = std::min(idx + semiWidth,size-1);
// compute the reference average on the host
float expected = 0;
for(int j = start ; j <= end ; ++j)
{
expected += values[j];
}
expected /= float(end - start + 1);
if(std::abs(expected - smoothValues[idx]) > 1e-3)
{
std::cout << "idx = " << idx << " -- expected = " << expected << " -- result = " << smoothValues[idx] << std::endl;
res = false;
}
}
return res;
}
// generate a uniform random value in [0,10)
float myRand()
{
static const double divisor = double(RAND_MAX)+1.;
return double(rand())/divisor * 10.;
}
int main()
{
unsigned int size = 1024;
// wSize must be odd
unsigned int wSize = 21;
// get the default device
compute::device device = compute::system::default_device();
// create a context for the device
compute::context context(device);
// get the program
compute::program program = _sma_program(context);
// create vector of random numbers on the host
std::vector<float> host_vector(size);
std::vector<float> host_result(size);
std::generate(host_vector.begin(), host_vector.end(), myRand);
compute::vector<float> a(size,context);
compute::vector<float> b(size,context);
compute::vector<float> c(size,context);
compute::command_queue queue(context, device);
compute::copy(host_vector.begin(),host_vector.end(),a.begin(),queue);
// scan values
compute::inclusive_scan(a.begin(),a.end(),b.begin(),queue);
// sma kernel
compute::kernel kernel(program, "SMA");
kernel.set_arg(0,b.get_buffer());
kernel.set_arg(1,(int)b.size());
kernel.set_arg(2,c.get_buffer());
kernel.set_arg(3,(int)wSize);
uint tpb = 128;
uint workSize = size;
queue.enqueue_1d_range_kernel(kernel,0,workSize,tpb);
compute::copy(c.begin(),c.end(),host_result.begin(),queue);
bool res = _check(host_vector,host_result,wSize);
std::string status = res ? "results are equivalent" : "GPU results differ from the CPU ones";
std::cout << status << std::endl;
return 0;
}


@@ -29,6 +29,7 @@ set(BENCHMARKS
sort_float
unique
unique_copy
exclusive_scan
)
foreach(BENCHMARK ${BENCHMARKS})
@@ -70,6 +71,7 @@ if(${BOOST_COMPUTE_HAVE_CUDA})
thrust_partial_sum
thrust_saxpy
thrust_sort
thrust_exclusive_scan
)
foreach(BENCHMARK ${CUDA_BENCHMARKS})


@@ -0,0 +1,87 @@
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/exclusive_scan.hpp>
#include <boost/compute/container/vector.hpp>
#include "perf.hpp"
int rand_int()
{
return static_cast<int>((rand() / double(RAND_MAX)) * 25.0);
}
int main(int argc, char *argv[])
{
perf_parse_args(argc, argv);
std::cout << "size: " << PERF_N << std::endl;
// setup context and queue for the default device
boost::compute::device device = boost::compute::system::default_device();
boost::compute::context context(device);
boost::compute::command_queue queue(context, device);
std::cout << "device: " << device.name() << std::endl;
// create vector of random numbers on the host
std::vector<int> host_vector(PERF_N);
std::generate(host_vector.begin(), host_vector.end(), rand_int);
// create vector on the device and copy the data
boost::compute::vector<int> device_vector(PERF_N, context);
boost::compute::vector<int> device_res(PERF_N,context);
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
device_vector.begin(),
queue
);
// sum vector
perf_timer t;
for(size_t trial = 0; trial < PERF_TRIALS; trial++){
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
device_vector.begin(),
queue
);
t.start();
boost::compute::exclusive_scan(
device_vector.begin(),
device_vector.end(),
device_res.begin(),
queue
);
queue.finish();
t.stop();
}
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
// verify sum is correct
std::partial_sum(
host_vector.begin(),
host_vector.end(),
host_vector.begin()
);
int device_sum = device_res.back();
// an exclusive scan shifts the results one position to the right of an
// inclusive scan (e.g. {1,2,3} -> inclusive {1,3,6}, exclusive {0,1,3}),
// so the last device value corresponds to the next-to-last inclusive sum
int host_sum = host_vector[host_vector.size()-2];
if(device_sum != host_sum){
std::cout << "ERROR: "
<< "device_sum (" << device_sum << ") "
<< "!= "
<< "host_sum (" << host_sum << ")"
<< std::endl;
return -1;
}
return 0;
}


@@ -42,6 +42,7 @@ int main(int argc, char *argv[])
// create vector on the device and copy the data
boost::compute::vector<int> device_vector(PERF_N, context);
boost::compute::vector<int> device_res(PERF_N,context);
boost::compute::copy(
host_vector.begin(),
host_vector.end(),
@@ -63,7 +64,7 @@ int main(int argc, char *argv[])
boost::compute::partial_sum(
device_vector.begin(),
device_vector.end(),
device_vector.begin(),
device_res.begin(),
queue
);
queue.finish();
@@ -78,7 +79,7 @@ int main(int argc, char *argv[])
host_vector.begin()
);
int device_sum = device_vector.back();
int device_sum = device_res.back();
int host_sum = host_vector.back();
if(device_sum != host_sum){


@@ -0,0 +1,37 @@
#include <algorithm>
#include <cstdlib>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include "perf.hpp"
int main(int argc, char *argv[])
{
perf_parse_args(argc, argv);
std::cout << "size: " << PERF_N << std::endl;
thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);
// transfer data to the device
thrust::device_vector<int> d_vec = h_vec;
perf_timer t;
for(size_t trial = 0; trial < PERF_TRIALS; trial++){
d_vec = h_vec;
t.start();
thrust::exclusive_scan(d_vec.begin(), d_vec.end(), d_vec.begin());
cudaDeviceSynchronize();
t.stop();
}
std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
return 0;
}