matrix_fold.hpp
Go to the documentation of this file.
1 /*!
2  * \brief Kernels for folding matrices with OpenCL
3  *
4  * \author O. Krause
5  * \date 2016
6  *
7  *
8  * \par Copyright 1995-2015 Shark Development Team
9  *
10  * <BR><HR>
11  * This file is part of Shark.
12  * <http://image.diku.dk/shark/>
13  *
14  * Shark is free software: you can redistribute it and/or modify
15  * it under the terms of the GNU Lesser General Public License as published
16  * by the Free Software Foundation, either version 3 of the License, or
17  * (at your option) any later version.
18  *
19  * Shark is distributed in the hope that it will be useful,
20  * but WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  * GNU Lesser General Public License for more details.
23  *
24  * You should have received a copy of the GNU Lesser General Public License
25  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26  *
27  */
28 #ifndef REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP
29 #define REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP
30 
31 #include "../../expression_types.hpp"
32 #include "../../detail/traits.hpp"
33 #include <boost/compute/kernel.hpp>
34 #include <boost/compute/detail/meta_kernel.hpp>
35 #include <boost/compute/container/array.hpp>
36 #include <boost/compute/algorithm/copy_n.hpp>
37 namespace remora{namespace bindings{
38 
39 template<class F, class MatA, class Orientation>
40 void matrix_fold(matrix_expression<MatA, gpu_tag> const& A, typename F::result_type& value, Orientation, dense_tag) {
41  auto& queue = A().queue();
42  typedef typename F::result_type value_type;
43  boost::compute::detail::meta_kernel k("blas_matrix_fold");
44  std::size_t size1_index = k.add_arg<std::size_t>("size1");
45  std::size_t size2_index = k.add_arg<std::size_t>("size2");
46  boost::compute::array<value_type,1> device_result;
47  boost::compute::copy_n(&value, 1, device_result.begin(), queue);
48  device_result.front() = value;
49  F f;
50 
51  //read all tiles in the assigned rows and apply f
52  k << "__local " <<k.decl<value_type>("subfold")<< "[TILE_DIM][TILE_DIM+1];";
53  k << "subfold[get_local_id(0)][get_local_id(1)] = "<<device_result.begin()[0]<<';';
54  k << "for(uint i = get_local_id(0) ; i < size1; i += TILE_DIM){";
55  k << " for(uint j = get_local_id(1) ; j < size2; j += TILE_DIM){";
56  auto exprSubFold = k.expr<value_type>("subfold[get_local_id(0)][get_local_id(1)]");
57  k<< exprSubFold << '=' << f(exprSubFold,A()(k.expr<cl_uint>("i"),k.expr<cl_uint>("j")))<<";";
58  k<<"}}";
59  k << "barrier(CLK_LOCAL_MEM_FENCE);";//wait until all threads are done with copying
60  //sum up the rows
61  k << "if(get_local_id(0) == 0){";
62  k << " for(uint i = 1 ; i < TILE_DIM; ++i){";
63  k << " subfold[0][get_local_id(1)] ="
64  << f(
65  k.expr<value_type>("subfold[0][get_local_id(1)]"),
66  k.expr<value_type>("subfold[i][get_local_id(1)]")
67  )<<';';
68  k << " }";
69  k <<" if(get_local_id(1) == 0){";
70  k << " for(uint i = 1 ; i < TILE_DIM; ++i){";
71  k <<" subfold[0][0] =" << f(k.expr<value_type>("subfold[0][0]"),k.expr<value_type>("subfold[0][i]"))<<';';
72  k <<" }";
73  k <<device_result.begin()[0]<< "= subfold[0][0];";
74  k<< "}}";
75 
76  //compile kernel
77  std::size_t TILE_DIM = 1;
78  char const* options ="-DTILE_DIM=1";
79  boost::compute::kernel kernel = k.compile(queue.get_context(), options);
80  //enqueue kernel
81  kernel.set_arg(size1_index, A().size1());
82  kernel.set_arg(size2_index, A().size2());
83 
84  std::size_t global_work_size[2] = {TILE_DIM,TILE_DIM};
85  std::size_t local_work_size[2] = {TILE_DIM, TILE_DIM};
86  queue.enqueue_nd_range_kernel(kernel, 2,nullptr, global_work_size, local_work_size);
87  boost::compute::copy_n(device_result.begin(), 1, &value, queue);
88 }
89 }}
90 #endif