trmv.hpp
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief Triangular matrix-vector product (trmv) kernels for the GPU backend (OpenCL via boost.compute)
6  *
7  * \author O. Krause
8  * \date 2016
9  *
10  *
11  * \par Copyright 1995-2015 Shark Development Team
12  *
13  * <BR><HR>
14  * This file is part of Shark.
15  * <http://image.diku.dk/shark/>
16  *
17  * Shark is free software: you can redistribute it and/or modify
18  * it under the terms of the GNU Lesser General Public License as published
19  * by the Free Software Foundation, either version 3 of the License, or
20  * (at your option) any later version.
21  *
22  * Shark is distributed in the hope that it will be useful,
23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25  * GNU Lesser General Public License for more details.
26  *
27  * You should have received a copy of the GNU Lesser General Public License
28  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
29  *
30  */
31 //===========================================================================
32 #ifndef REMORA_KERNELS_CLBLAS_TRMV_HPP
33 #define REMORA_KERNELS_CLBLAS_TRMV_HPP
34 
35 
36 #include "../../expression_types.hpp"
37 #include "../../detail/traits.hpp"
38 #include <boost/compute/kernel.hpp>
39 #include <boost/compute/detail/meta_kernel.hpp>
40 #include <boost/compute/functional/operator.hpp> //for multiplies
41 #include "../gemv.hpp"
42 
43 namespace remora {namespace bindings {
44 
45 struct trmv_kernel{
46  boost::compute::kernel kernel;
47  std::size_t start_index;
48  std::size_t end_index;
49  std::size_t unit_index;
50  std::size_t upper_index;
51 };
52 //Lower triangular - matrix(row-major)
53 template<class MatA, class VecV>
54 trmv_kernel createTRMVBlockKernel(
55  matrix_expression<MatA, gpu_tag> const& A,
56  vector_expression<VecV, gpu_tag> &v,
57  char const* options
58 ){
59  typedef typename MatA::value_type value_typeA;
60  typedef typename VecV::value_type value_typeV;
61  boost::compute::multiplies<value_typeV> prod;
62 
63  boost::compute::detail::meta_kernel k("blas_trmv");
64  std::size_t start_index = k.add_arg<std::size_t>("start");//start of block of A
65  std::size_t end_index = k.add_arg<std::size_t>("end");//end of Block of A
66  std::size_t unit_index = k.add_arg<std::size_t>("unit");//whether A is unit triangular
67  std::size_t upper_index = k.add_arg<std::size_t>("upper");//whether A is unit triangular
68  // Local memory to fit a tile of A and B
69  // we store B as column major in local memory
70  // we also allocate memory to store results of B
71  k << "__local " <<k.decl<value_typeA>("Asub")<< "[TILE_SIZE][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
72  k << "__local " <<k.decl<value_typeV>("Bsub")<< "[TILE_SIZE];\n";
73  k << "__local " <<k.decl<value_typeV>("BResult")<< "[TILE_SIZE];\n";
74  k << " const ulong numWorkers = get_local_size(0);\n";
75 
76  // Load tile of A into local memory
77  k << "const ulong curTileA = end-start;\n";
78  k << "for(ulong i = 0; i < curTileA; ++i){\n";
79  k << " for(ulong j = get_local_id(0); j < curTileA; j += numWorkers){\n";
80  k << " Asub[i][j] ="<< A()(k.expr<cl_ulong>("(i+start)"),k.expr<cl_ulong>("(j+start)"))<<";\n";
81  k << " }\n";
82  k << "}\n";
83 
84  //ensure we are not reading out of bounds
85  // Load Tile of B into local memory, store columns of B as rows
86  k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
87  k << " Bsub[i] = "<< v()(k.expr<cl_ulong>("(start+i)"))<<";\n";
88  k << "}\n";
89  // Synchronise to make sure the tile is loaded
90  k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
91 
92  // Loop over the values of a single tile
93  // by computing outer products ulongo the local accumulation registers acc
94  //lower-case
95  k << "if(!upper){\n";
96  k << " for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
97  k << " BResult[i] = Bsub[i];\n";
98  k << " if(!unit){BResult[i] *= Asub[i][i];}\n";
99  k << " for(ulong j = 0; j < i; ++j){\n";
100  k << " BResult[i] +="<< prod(k.expr<value_typeV>("Bsub[j]"), k.expr<value_typeA>("Asub[i][j]"))<<";\n";
101  k << " }\n";
102  k << " }\n";
103  k << "}else{\n";
104  //upper case
105  k << " for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
106  k << " BResult[i] = Bsub[i];\n";
107  k << " if(!unit){BResult[i] *= Asub[i][i];}\n";
108  k << " for(ulong j = i+1; j < curTileA; ++j){\n";
109  k << " BResult[i] +="<< prod(k.expr<value_typeV>("Bsub[j]"), k.expr<value_typeA>("Asub[i][j]"))<<";\n";
110  k << " }\n";
111  k << " }\n";
112  k << "}\n";
113  //~ // Synchronise before loading the next tile
114  k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
115  // Store the final results back in B
116  k << "for(ulong i = 0; i < curTileA; ++i){\n";
117  k << v()(k.expr<cl_ulong>("(start+i)"))<<" = BResult[i];\n";
118  k << "}\n";
119 
120  boost::compute::kernel kernel = k.compile(v().queue().get_context(), options);
121  return {kernel,start_index,end_index,unit_index,upper_index};
122 }
123 
124 template <typename MatA, typename VecV, typename Triangular>
125 void trmv_recursive(
126  matrix_expression<MatA, gpu_tag> const& Afull,
127  vector_expression<VecV, gpu_tag> & vfull,
128  trmv_kernel& kernel,
129  std::size_t start,
130  std::size_t end,
131  std::size_t tileSizeA,
132  Triangular t
133 ){
134  std::size_t size = end-start;
135 
136  //if the matrix is small enough, call the computation kernel directly for the block
137  if(size <= tileSizeA){
138  //enqueue kernel with kernel args
139  kernel.kernel.set_arg(kernel.start_index, start);
140  kernel.kernel.set_arg(kernel.end_index, end);
141  kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
142  kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);
143 
144  std::size_t global_work_size[2] = {
145  (size+tileSizeA-1) / tileSizeA * tileSizeA,
146  1
147  };
148  std::size_t local_work_size[2] = {tileSizeA, 1};
149  vfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,nullptr, global_work_size, local_work_size);
150  return;
151  }
152  //otherwise run the kernel recursively
153  std::size_t split = ((size+tileSizeA-1)/tileSizeA)/2 * tileSizeA;//split at the next multiple of the TileSize
154  vector_range<VecV> vfront(vfull(),start,start+split);
155  vector_range<VecV> vback(vfull(),start+split,end);
156 
157  if(Triangular::is_upper){ //Upper triangular case
158  matrix_range<typename const_expression<MatA>::type > Aur(Afull(),start,start+split,start+split,end);
159  trmv_recursive(Afull, vfull, kernel, start, start+split, tileSizeA, t);
160  kernels::gemv(Aur, vback, vfront, 1.0);
161  trmv_recursive(Afull, vfull, kernel, start+split, end, tileSizeA, t);
162  }else{// Lower triangular caste
163  matrix_range<typename const_expression<MatA>::type> All(Afull(),start+split,end,start,start+split);
164  trmv_recursive(Afull, vfull, kernel, start+split, end, tileSizeA, t);
165  kernels::gemv(All, vfront, vback, 1.0);
166  trmv_recursive(Afull, vfull, kernel, start, start+split, tileSizeA, t);
167  }
168 
169 }
170 }
171 namespace kernels{
172 //main kernel runs the kernel above recursively and calls gemv
173 template <bool Upper,bool Unit,typename MatA, typename VecV>
174 void trmv(
175  matrix_expression<MatA, gpu_tag> const& A,
176  vector_expression<VecV, gpu_tag>& v
177 ){
178  REMORA_SIZE_CHECK(A().size1() == A().size2());
179  REMORA_SIZE_CHECK(A().size2() == v().size());
180 
181  std::size_t const TileSizeA = 32;//size of the diagonal blocks where the single kernel runs
182  char const* options ="-DTILE_SIZE=32ul";
183  auto kernel = bindings::createTRMVBlockKernel(A,v,options);
184 
185  bindings::trmv_recursive(A,v,kernel,0,A().size1(), TileSizeA, triangular_tag<Upper,Unit>());
186 
187 }
188 
189 }}
190 #endif