28 #ifndef REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP 29 #define REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP 31 #include "../../expression_types.hpp" 32 #include "../../detail/traits.hpp" 33 #include <boost/compute/kernel.hpp> 34 #include <boost/compute/detail/meta_kernel.hpp> 35 #include <boost/compute/container/array.hpp> 36 #include <boost/compute/algorithm/copy_n.hpp> 37 namespace remora{
namespace bindings{
39 template<
class F,
class MatA,
class Orientation>
40 void matrix_fold(matrix_expression<MatA, gpu_tag>
const& A,
typename F::result_type& value, Orientation, dense_tag) {
41 auto& queue = A().queue();
42 typedef typename F::result_type value_type;
43 boost::compute::detail::meta_kernel k(
"blas_matrix_fold");
44 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
45 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
46 boost::compute::array<value_type,1> device_result;
47 boost::compute::copy_n(&value, 1, device_result.begin(), queue);
48 device_result.front() = value;
52 k <<
"__local " <<k.decl<value_type>(
"subfold")<<
"[TILE_DIM][TILE_DIM+1];";
53 k <<
"subfold[get_local_id(0)][get_local_id(1)] = "<<device_result.begin()[0]<<
';';
54 k <<
"for(uint i = get_local_id(0) ; i < size1; i += TILE_DIM){";
55 k <<
" for(uint j = get_local_id(1) ; j < size2; j += TILE_DIM){";
56 auto exprSubFold = k.expr<value_type>(
"subfold[get_local_id(0)][get_local_id(1)]");
57 k<< exprSubFold <<
'=' << f(exprSubFold,A()(k.expr<cl_uint>(
"i"),k.expr<cl_uint>(
"j")))<<
";";
59 k <<
"barrier(CLK_LOCAL_MEM_FENCE);";
61 k <<
"if(get_local_id(0) == 0){";
62 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
63 k <<
" subfold[0][get_local_id(1)] =" 65 k.expr<value_type>(
"subfold[0][get_local_id(1)]"),
66 k.expr<value_type>(
"subfold[i][get_local_id(1)]")
69 k <<
" if(get_local_id(1) == 0){";
70 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
71 k <<
" subfold[0][0] =" << f(k.expr<value_type>(
"subfold[0][0]"),k.expr<value_type>(
"subfold[0][i]"))<<
';';
73 k <<device_result.begin()[0]<<
"= subfold[0][0];";
77 std::size_t TILE_DIM = 1;
78 char const* options =
"-DTILE_DIM=1";
79 boost::compute::kernel kernel = k.compile(queue.get_context(), options);
81 kernel.set_arg(size1_index, A().size1());
82 kernel.set_arg(size2_index, A().size2());
84 std::size_t global_work_size[2] = {TILE_DIM,TILE_DIM};
85 std::size_t local_work_size[2] = {TILE_DIM, TILE_DIM};
86 queue.enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);
87 boost::compute::copy_n(device_result.begin(), 1, &value, queue);