28 #ifndef REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP 29 #define REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP 31 #include "../../expression_types.hpp" 32 #include "../../detail/traits.hpp" 33 #include <boost/compute/kernel.hpp> 34 #include <boost/compute/detail/meta_kernel.hpp> 36 namespace remora{
namespace bindings{
43 template<
class F,
class M>
45 matrix_expression<M, gpu_tag> &m,
46 typename M::value_type t,
49 typedef typename M::value_type value_type;
50 boost::compute::detail::meta_kernel k(
"blas_matrix_assign_constant");
51 std::size_t t_index = k.add_arg<value_type>(
"t");
54 auto exprRow=k.expr<cl_uint>(
"get_global_id(0)");
55 auto exprCol=k.expr<cl_uint>(
"get_global_id(1)");
56 k<< m()(exprRow,exprCol) <<
'=' << F()(m()(exprRow,exprCol), k.var<value_type>(
"t"))<<
";";
57 boost::compute::kernel kernel = k.compile(m().queue().get_context());
59 kernel.set_arg(t_index, t);
60 std::size_t global_work_size[2] = {m().size1(), m().size2()};
61 m().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size,
nullptr);
70 template<
class F,
class M,
class E>
71 void matrix_assign_functor(
72 matrix_expression<M, gpu_tag> &m,
73 matrix_expression<E, gpu_tag>
const& e,
75 row_major, row_major,dense_tag, dense_tag
78 boost::compute::detail::meta_kernel k(
"blas_matrix_assign");
79 auto exprRow=k.expr<cl_uint>(
"get_global_id(0)");
80 auto exprCol=k.expr<cl_uint>(
"get_global_id(1)");
81 k<< m()(exprRow,exprCol) <<
'=' << f(m()(exprRow,exprCol),e()(exprRow,exprCol))<<
";\n";
83 boost::compute::kernel kernel = k.compile(m().queue().get_context());
84 std::size_t global_work_size[2] = {m().size1(), m().size2()};
85 m().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size,
nullptr);
89 template<
class F,
class M,
class E>
90 void matrix_assign_functor(
91 matrix_expression<M, gpu_tag> &m,
92 matrix_expression<E, gpu_tag>
const& e,
94 row_major, column_major,dense_tag, dense_tag
97 typedef typename M::value_type value_type;
98 std::size_t TILE_DIM = 32;
99 char const* options =
"-DTILE_DIM=32ul";
104 std::size_t BLOCK_COLS = 8;
108 boost::compute::detail::meta_kernel k(
"blas_matrix_assign_row_col");
113 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
114 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
115 k <<
"__local " <<k.decl<value_type>(
"tile")<<
"[TILE_DIM][TILE_DIM+2];\n";
116 k <<
"uint base_row = get_group_id(0) * TILE_DIM;\n";
117 k <<
"uint base_col = get_group_id(1) * TILE_DIM;\n";
121 k <<
"uint maxDim1 = min(size1-base_row,TILE_DIM);\n";
122 k <<
"uint maxDim2 = min(size2-base_col,TILE_DIM);\n";
123 k <<
"for(uint i = get_local_id(1) ; i < maxDim2 && get_local_id(0) < maxDim1; i += get_local_size(1)){\n";
124 auto row_exp = k.expr<cl_uint>(
"(base_row+get_local_id(0))");
125 auto col_exp = k.expr<cl_uint>(
"(base_col+i)");
126 k <<
" tile[get_local_id(0)][i] =" << e()(row_exp, col_exp)<<
";\n";
128 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
131 k <<
"for(uint i = get_local_id(1); i < maxDim1 && get_local_id(0) < maxDim2; i += get_local_size(1)){\n";
132 auto target = m()(k.expr<cl_uint>(
"(base_row + i)"), k.expr<cl_uint>(
"(base_col + get_local_id(0))"));
133 k << target <<
" = " <<f(target, k.expr<cl_uint>(
"tile[i][get_local_id(0)]"))<<
";\n";
138 boost::compute::kernel kernel = k.compile(m().queue().get_context(), options);
141 kernel.set_arg(size1_index, m().size1());
142 kernel.set_arg(size2_index, m().size2());
143 std::size_t global_work_size[2] = {(m().size1()+TILE_DIM-1) / TILE_DIM * TILE_DIM, (m().size2()+TILE_DIM-1) / TILE_DIM * BLOCK_COLS };
144 std::size_t local_work_size[2] = {TILE_DIM, BLOCK_COLS};
145 m().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);
156 template<
class Arg1,
class Arg2>
157 Arg2 operator()(Arg1
const&, Arg2
const& y)
const{
163 template<
class M,
class E>
165 matrix_expression<M, gpu_tag> &m,
166 matrix_expression<E, gpu_tag>
const& e,
167 row_major o, row_major,dense_tag t, dense_tag
169 matrix_assign_functor(m,e,detail::assigner(),o,o,t,t);
173 template<
class M,
class E>
175 matrix_expression<M, gpu_tag> &m,
176 matrix_expression<E, gpu_tag>
const& e,
177 row_major o1, column_major o2,dense_tag t, dense_tag
179 matrix_assign_functor(m,e,detail::assigner(),o1,o2,t,t);