32 #ifndef REMORA_KERNELS_GPU_GEMV_HPP 33 #define REMORA_KERNELS_GPU_GEMV_HPP 35 #include "../../expression_types.hpp" 36 #include "../../detail/traits.hpp" 37 #include <boost/compute/kernel.hpp> 38 #include <boost/compute/detail/meta_kernel.hpp> 39 #include <boost/compute/functional/operator.hpp> 41 namespace remora{
namespace kernels{
44 template <
typename MatA,
typename VecX,
typename VecV>
46 matrix_expression<MatA, gpu_tag>
const& A,
47 vector_expression<VecX, gpu_tag>
const& x,
48 vector_expression<VecV, gpu_tag>& v,
49 typename VecV::value_type
const& alpha
51 REMORA_SIZE_CHECK(A().size1() == v().size());
52 REMORA_SIZE_CHECK(A().size2() == x().size());
55 typedef typename VecV::value_type value_type;
56 boost::compute::multiplies<value_type> prod;
57 boost::compute::detail::meta_kernel k(
"blas_gemv");
58 std::size_t alpha_index = k.add_arg<value_type>(
"alpha");
59 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
60 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
62 k <<
"__local " <<k.decl<value_type>(
"results")<<
"[TILE_DIM][TILE_DIM+2];";
63 k <<
"uint rowid = get_global_id(0);";
64 k <<
"results[get_local_id(0)][get_local_id(1)] = 0.0;";
65 k <<
"for(uint i = get_local_id(1) ; i < size2 && rowid < size1; i += TILE_DIM){";
66 auto exprRow = k.expr<cl_uint>(
"rowid");
67 auto exprCol = k.expr<cl_uint>(
"i");
68 k<<
" results[get_local_id(0)][get_local_id(1)] += "<< prod(A()(exprRow,exprCol),x()(exprCol))<<
";";
70 k <<
"barrier(CLK_LOCAL_MEM_FENCE);";
72 k <<
"if(get_local_id(1) == 0 && rowid < size1){";
73 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
74 k <<
" results[get_local_id(0)][0] +=results[get_local_id(0)][i];";
76 k << v()(exprRow) <<
"+= alpha * results[get_local_id(0)][0];";
80 std::size_t TILE_DIM = 16;
81 char const* options =
"-DTILE_DIM=16";
82 boost::compute::kernel kernel = k.compile(v().queue().get_context(), options);
84 kernel.set_arg(alpha_index, alpha);
85 kernel.set_arg(size1_index, A().size1());
86 kernel.set_arg(size2_index, A().size2());
88 std::size_t global_work_size[2] = {
89 ((A().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
92 std::size_t local_work_size[2] = {TILE_DIM,TILE_DIM};
93 v().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);