matrix_assign.hpp
/*!
 * \brief Kernels for matrix-expression assignments on the GPU
 *
 * \author O. Krause
 * \date 2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP
#define REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <boost/compute/kernel.hpp>
#include <boost/compute/detail/meta_kernel.hpp>

namespace remora{namespace bindings{

//////////////////////////////////////////////////////
////Scalar Assignment to Matrix
//////////////////////////////////////////////////////

// Explicitly iterating row major
template<class F, class M>
void matrix_assign(
	matrix_expression<M, gpu_tag> &m,
	typename M::value_type t,
	row_major
){
	typedef typename M::value_type value_type;
	boost::compute::detail::meta_kernel k("blas_matrix_assign_constant");
	std::size_t t_index = k.add_arg<value_type>("t");

	//create source
	auto exprRow = k.expr<cl_uint>("get_global_id(0)");
	auto exprCol = k.expr<cl_uint>("get_global_id(1)");
	k << m()(exprRow,exprCol) << '=' << F()(m()(exprRow,exprCol), k.var<value_type>("t")) << ";";
	boost::compute::kernel kernel = k.compile(m().queue().get_context());
	//enqueue kernel
	kernel.set_arg(t_index, t);
	std::size_t global_work_size[2] = {m().size1(), m().size2()};
	m().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, nullptr);
}
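
// For orientation, a hedged sketch of roughly what the meta_kernel above generates.
// The buffer argument, stride and indexing are emitted by the matrix expression itself,
// so the names below are purely illustrative (assuming a dense row-major float matrix
// and F being an "add"-style functor):
//
//   __kernel void blas_matrix_assign_constant(float t, __global float* m, uint stride){
//       uint row = get_global_id(0);
//       uint col = get_global_id(1);
//       m[row * stride + col] = m[row * stride + col] + t;
//   }
//
// One work item is launched per matrix element, so the kernel itself needs no loops.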


///////////////////////////////////////////////////////////////////////////////////////////
//////Matrix Assignment With Functor implementing +=,-=...
///////////////////////////////////////////////////////////////////////////////////////////

//dense-dense case row-major, row-major
template<class F, class M, class E>
void matrix_assign_functor(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	F f,
	row_major, row_major, dense_tag, dense_tag
) {
	//create source
	boost::compute::detail::meta_kernel k("blas_matrix_assign");
	auto exprRow = k.expr<cl_uint>("get_global_id(0)");
	auto exprCol = k.expr<cl_uint>("get_global_id(1)");
	k << m()(exprRow,exprCol) << '=' << f(m()(exprRow,exprCol), e()(exprRow,exprCol)) << ";\n";
	//enqueue kernel
	boost::compute::kernel kernel = k.compile(m().queue().get_context());
	std::size_t global_work_size[2] = {m().size1(), m().size2()};
	m().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, nullptr);
}
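
// As with the scalar kernel above, this is purely element-wise: each of the
// size1()*size2() work items (i, j) evaluates roughly m(i, j) = f(m(i, j), e(i, j)).
// Since both operands are row-major, no local-memory staging is needed here.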

//dense-dense case row-major, column-major
template<class F, class M, class E>
void matrix_assign_functor(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	F f,
	row_major, column_major, dense_tag, dense_tag
) {
	//Kernel is based on boost/compute/examples/matrix_transpose.cpp
	typedef typename M::value_type value_type;
	std::size_t TILE_DIM = 32;
	char const* options = "-DTILE_DIM=32ul";
	//There are usually not enough parallel worker threads in a local group
	//to fill a tile, so every parallel thread reads several elements.
	//BLOCK_COLS is the number of parallel threads reading a column
	//and must be a divisor of TILE_DIM
	//(see the worked launch-size example after this kernel).
	std::size_t BLOCK_COLS = 8;

	//create source
	boost::compute::detail::meta_kernel k("blas_matrix_assign_row_col");
	//Create local memory. We first copy a tile into local memory,
	//which gets the orientation right, and then copy the tile
	//to the target.
	//The extra padding columns (TILE_DIM+2) avoid bank conflicts in local memory.
	std::size_t size1_index = k.add_arg<std::size_t>("size1");
	std::size_t size2_index = k.add_arg<std::size_t>("size2");
	k << "__local " << k.decl<value_type>("tile") << "[TILE_DIM][TILE_DIM+2];\n";
	k << "uint base_row = get_group_id(0) * TILE_DIM;\n";
	k << "uint base_col = get_group_id(1) * TILE_DIM;\n";
	//copy into local memory, note the change of direction of the indices;
	//also note that on the boundary the tile
	//might be larger than the number of values to read
	k << "uint maxDim1 = min(size1-base_row,TILE_DIM);\n";
	k << "uint maxDim2 = min(size2-base_col,TILE_DIM);\n";
	k << "for(uint i = get_local_id(1) ; i < maxDim2 && get_local_id(0) < maxDim1; i += get_local_size(1)){\n";
	auto row_exp = k.expr<cl_uint>("(base_row+get_local_id(0))");
	auto col_exp = k.expr<cl_uint>("(base_col+i)");
	k << " tile[get_local_id(0)][i] =" << e()(row_exp, col_exp) << ";\n";
	k << "}\n";
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with copying
	//write output from local memory, again be sure that
	//we do not write outside the feasible area
	k << "for(uint i = get_local_id(1); i < maxDim1 && get_local_id(0) < maxDim2; i += get_local_size(1)){\n";
	auto target = m()(k.expr<cl_uint>("(base_row + i)"), k.expr<cl_uint>("(base_col + get_local_id(0))"));
	k << target << " = " << f(target, k.expr<value_type>("tile[i][get_local_id(0)]")) << ";\n";
	k << "}\n";

	//compile kernel
	boost::compute::kernel kernel = k.compile(m().queue().get_context(), options);

	//enqueue kernel
	kernel.set_arg(size1_index, m().size1());
	kernel.set_arg(size2_index, m().size2());
	std::size_t global_work_size[2] = {(m().size1()+TILE_DIM-1) / TILE_DIM * TILE_DIM, (m().size2()+TILE_DIM-1) / TILE_DIM * BLOCK_COLS};
	std::size_t local_work_size[2] = {TILE_DIM, BLOCK_COLS};
	m().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
}
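
// A hedged worked example of the launch configuration above, for hypothetical sizes:
// with a 100 x 70 target matrix, TILE_DIM = 32 and BLOCK_COLS = 8,
//   global_work_size = {(100+31)/32 * 32, (70+31)/32 * 8} = {128, 24}
//   local_work_size  = {32, 8}
// i.e. 4 x 3 = 12 work groups, each responsible for one 32 x 32 tile, and each of the
// 32*8 = 256 threads of a group handles up to TILE_DIM/BLOCK_COLS = 4 elements per copy loop.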

/////////////////////////////////////////////////////////////////
//////Matrix Assignment implementing op=
/////////////////////////////////////////////////////////////////

//implemented by calling the functions above with the assigner functor below

namespace detail {
struct assigner{
	template<class Arg1, class Arg2>
	Arg2 operator()(Arg1 const&, Arg2 const& y) const{
		return y;
	}
};
}
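
// Note: assigner returns its second argument unchanged, so passing it to
// matrix_assign_functor turns f(m(i,j), e(i,j)) into a plain copy m(i,j) = e(i,j).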

//dense-dense case row-major, row-major
template<class M, class E>
void matrix_assign(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	row_major o, row_major, dense_tag t, dense_tag
) {
	matrix_assign_functor(m, e, detail::assigner(), o, o, t, t);
}

//dense-dense case row-major, column-major
template<class M, class E>
void matrix_assign(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	row_major o1, column_major o2, dense_tag t, dense_tag
) {
	matrix_assign_functor(m, e, detail::assigner(), o1, o2, t, t);
}
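
// A hedged usage sketch, not part of this header: gpu_matrix stands for any dense,
// row-major remora matrix expression living on the gpu, and some_functor for a
// meta_kernel-compatible binary functor; both names are placeholders. The tag
// arguments select the overloads above.
//
//   gpu_matrix A(rows, cols), B(rows, cols);
//   //plain copy of B into A
//   matrix_assign(A, B, row_major(), row_major(), dense_tag(), dense_tag());
//   //A(i,j) = some_functor(A(i,j), 2.0f) for every element
//   matrix_assign<some_functor>(A, 2.0f, row_major());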

}}

#endif