gemv.hpp
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Dense matrix-vector product kernel (v += alpha * A * x) for the GPU backend.
6  *
7  * \author O. Krause
8  * \date 2016
9  *
10  *
11  * \par Copyright 1995-2015 Shark Development Team
12  *
13  * <BR><HR>
14  * This file is part of Shark.
15  * <http://image.diku.dk/shark/>
16  *
17  * Shark is free software: you can redistribute it and/or modify
18  * it under the terms of the GNU Lesser General Public License as published
19  * by the Free Software Foundation, either version 3 of the License, or
20  * (at your option) any later version.
21  *
22  * Shark is distributed in the hope that it will be useful,
23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25  * GNU Lesser General Public License for more details.
26  *
27  * You should have received a copy of the GNU Lesser General Public License
28  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
29  *
30  */
31 //===========================================================================
32 #ifndef REMORA_KERNELS_GPU_GEMV_HPP
33 #define REMORA_KERNELS_GPU_GEMV_HPP
34 
#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <string> //for std::to_string (kernel compile options)
#include <boost/compute/kernel.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/functional/operator.hpp> //for multiplies
40 
41 namespace remora{ namespace kernels{
42 
43 // v <- v + alpha * A * x
44 template <typename MatA, typename VecX, typename VecV>
45 void gemv(
46  matrix_expression<MatA, gpu_tag> const& A,
47  vector_expression<VecX, gpu_tag> const& x,
48  vector_expression<VecV, gpu_tag>& v,
49  typename VecV::value_type const& alpha
50 ) {
51  REMORA_SIZE_CHECK(A().size1() == v().size());
52  REMORA_SIZE_CHECK(A().size2() == x().size());
53 
54 
55  typedef typename VecV::value_type value_type;
56  boost::compute::multiplies<value_type> prod;
57  boost::compute::detail::meta_kernel k("blas_gemv");
58  std::size_t alpha_index = k.add_arg<value_type>("alpha");
59  std::size_t size1_index = k.add_arg<std::size_t>("size1");
60  std::size_t size2_index = k.add_arg<std::size_t>("size2");
61  //read all tiles in the assigned rows and compute the inner product
62  k << "__local " <<k.decl<value_type>("results")<< "[TILE_DIM][TILE_DIM+2];";
63  k << "uint rowid = get_global_id(0);";
64  k << "results[get_local_id(0)][get_local_id(1)] = 0.0;";
65  k << "for(uint i = get_local_id(1) ; i < size2 && rowid < size1; i += TILE_DIM){";
66  auto exprRow = k.expr<cl_uint>("rowid");
67  auto exprCol = k.expr<cl_uint>("i");
68  k<< " results[get_local_id(0)][get_local_id(1)] += "<< prod(A()(exprRow,exprCol),x()(exprCol))<<";";
69  k<<'}';
70  k << "barrier(CLK_LOCAL_MEM_FENCE);";//wait until all threads are done with computing
71  //sum up the rows
72  k << "if(get_local_id(1) == 0 && rowid < size1){";
73  k << " for(uint i = 1 ; i < TILE_DIM; ++i){";
74  k << " results[get_local_id(0)][0] +=results[get_local_id(0)][i];";
75  k << " }";
76  k << v()(exprRow) << "+= alpha * results[get_local_id(0)][0];";
77  k<< "}";
78  //create source
79 
80  std::size_t TILE_DIM = 16;
81  char const* options ="-DTILE_DIM=16";
82  boost::compute::kernel kernel = k.compile(v().queue().get_context(), options);
83  //enqueue kernel
84  kernel.set_arg(alpha_index, alpha);
85  kernel.set_arg(size1_index, A().size1());
86  kernel.set_arg(size2_index, A().size2());
87 
88  std::size_t global_work_size[2] = {
89  ((A().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
90  TILE_DIM
91  };
92  std::size_t local_work_size[2] = {TILE_DIM,TILE_DIM};
93  v().queue().enqueue_nd_range_kernel(kernel, 2,nullptr, global_work_size, local_work_size);
94 }
95 
96 }}
97 
98 #endif