trsv.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief -
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_CLBLAS_TRSV_HPP
32 #define REMORA_KERNELS_CLBLAS_TRSV_HPP
33 
34 
35 #include "../../detail/traits.hpp"
36 #include "../../expression_types.hpp"
37 #include "../../detail/matrix_proxy_classes.hpp"
38 #include "../gemv.hpp"
39 #include <boost/compute/kernel.hpp>
40 #include <boost/compute/detail/meta_kernel.hpp>
41 #include <boost/compute/functional/operator.hpp> //for multiplies
42 
43 ///solves systems of triangular matrices with one left hand side
44 
45 namespace remora{namespace bindings {
46 struct trsv_kernel{
47  boost::compute::kernel kernel;
48  std::size_t start_index;
49  std::size_t end_index;
50  std::size_t unit_index;
51  std::size_t upper_index;
52 };
53 //Lower triangular - matrix(row-major)
54 template<class MatA, class VecB>
55 trsv_kernel createTRSVDiagBlockKernel(
56  matrix_expression<MatA, gpu_tag> const& A,
57  vector_expression<VecB, gpu_tag> &b,
58  char const* options
59 ){
60  typedef typename MatA::value_type value_typeA;
61  typedef typename VecB::value_type value_typeB;
62  boost::compute::multiplies<value_typeB> prod;
63 
64  boost::compute::detail::meta_kernel k("blas_trsv");
65  std::size_t start_index = k.add_arg<std::size_t>("start");//start of block of A
66  std::size_t end_index = k.add_arg<std::size_t>("end");//end of Block of A
67  std::size_t unit_index = k.add_arg<std::size_t>("unit");//whether A is unit triangular
68  std::size_t upper_index = k.add_arg<std::size_t>("upper");//whether A is upper triangular
69  // Local memory to fit a tile of A and the vector B
70  k << "__local " <<k.decl<value_typeA>("Asub")<< "[TILE_SIZE][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
71  k << "__local " <<k.decl<value_typeB>("Bsub")<< "[TILE_SIZE];\n";
72  k << "const ulong numWorkers = get_local_size(0);\n";
73  //ensure we are not reading out of bounds
74  k << "const ulong curTileA = end-start;\n";
75 
76  // Load tile of A into local memory
77  k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
78  k << " for(ulong j = 0; j < TILE_SIZE; j++){\n";
79  k << " Asub[i][j] ="<< A()(k.expr<cl_ulong>("min(end-1, start + i)"),k.expr<cl_ulong>("min(end-1, start + j)"))<<";\n";
80  k << " }\n";
81  k << "}\n";
82 
83 
84  // Load Tile of B into local memory, store columns of B as rows
85  k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
86  k << " Bsub[i] ="<< b()(k.expr<cl_ulong>("min(end-1,start + i)"))<<";\n";
87  k << "}\n";
88  // Synchronise to make sure everything is loaded
89  k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
90 
91  // Loop over the values of a single tile
92  //lower-case
93  k << "if(!upper){\n";
94  k << " for(ulong i = 0; i < TILE_SIZE && get_local_id(0) == 0; ++i){\n";
95  k << " if(!unit){Bsub[i] /= Asub[i][i];}\n";
96  k << " for(ulong j = i+1; j < TILE_SIZE; ++j){\n";
97  k << " Bsub[j] -= "<< prod(k.expr<value_typeB>("Bsub[i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
98  k << " }\n";
99  k << " }\n";
100  k << "}else{\n";
101  //upper case
102  k << " for(ulong n = curTileA; n > 0 && get_local_id(0) == 0; --n){\n";
103  k << " ulong i = n-1;\n";
104  k << " if(!unit ){Bsub[i] /= Asub[i][i];}\n";
105  k << " for(ulong j = 0; j < i; j ++){\n";
106  k << " Bsub[j] -= "<< prod(k.expr<value_typeB>("Bsub[i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
107  k << " }\n";
108  k << " }\n";
109  k << "}\n";
110  // Synchronise before continuing
111  k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
112  // Store the final results back in B
113  k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
114  k << b()(k.expr<cl_ulong>("(start+i)"))<<" = Bsub[i];\n";
115  k << "}\n";
116 
117  boost::compute::kernel kernel = k.compile(b().queue().get_context(), options);
118  return {kernel,start_index,end_index,unit_index,upper_index};
119 }
120 
121 template <typename MatA, typename VecB, class Triangular>
122 void trsv_recursive(
123  matrix_expression<MatA, gpu_tag> const& Afull,
124  vector_expression<VecB, gpu_tag> & bfull,
125  trsv_kernel& kernel,
126  std::size_t start,
127  std::size_t end,
128  std::size_t tileSize,
129  std::size_t numWorkers,
130  Triangular t
131 ){
132 
133  std::size_t size = end-start;
134  //if the matrix is small enough call the computation kernel directly for the block
135  if(size <= tileSize){
136  //enqueue kernel with kernel args
137  kernel.kernel.set_arg(kernel.start_index, start);
138  kernel.kernel.set_arg(kernel.end_index, end);
139  kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
140  kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);
141 
142  std::size_t global_work_size[2] = {numWorkers,1};
143  std::size_t local_work_size[2] = {numWorkers, 1};
144  bfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,nullptr, global_work_size, local_work_size);
145  return;
146  }
147  std::size_t numBlocks = (size+tileSize-1)/tileSize;
148  std::size_t split = numBlocks/2 * tileSize;
149  vector_range<VecB> bfront(bfull(),start,start+split);
150  vector_range<VecB> bback(bfull(),start+split,end);
151 
152  //otherwise run the kernel recursively
153  if(Triangular::is_upper){ //Upper triangular case
154  matrix_range<typename const_expression<MatA>::type > Aur(Afull(),start,start+split,start+split,end);
155  trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
156  kernels::gemv(Aur, bback, bfront, -1.0);
157  trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
158  }else{// Lower triangular caste
159  matrix_range<typename const_expression<MatA>::type> All(Afull(),start+split,end,start,start+split);
160  trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
161  kernels::gemv(All, bfront, bback, -1.0);
162  trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
163  }
164 }
165 
166 template <typename MatA, typename VecB, class Triangular>
167 void trsv_call(
168  matrix_expression<MatA, gpu_tag> const& A,
169  vector_expression<VecB, gpu_tag>& b,
170  Triangular,
171  left
172 ){
173  std::size_t const TileSize = 32;//size of the diagonal blocks where the single kernel runs
174  std::size_t const numWorkers = TileSize; //number of workers
175  char const* options ="-DTILE_SIZE=32ul";
176  auto kernel = bindings::createTRSVDiagBlockKernel(A,b,options);
177  trsv_recursive(A,b,kernel,0,A().size1(), TileSize, numWorkers, Triangular());
178 }
179 
180 template <typename MatA, typename VecB, class Triangular>
181 void trsv_call(
182  matrix_expression<MatA, gpu_tag> const& A,
183  vector_expression<VecB, gpu_tag>& b,
184  Triangular,
185  right
186 ){
187  matrix_transpose<typename const_expression<MatA>::type> transA(A());
188  trsv_call(transA,b,typename Triangular::transposed_orientation(),left());
189 }
190 }
191 namespace kernels{
192 //main kernel runs the kernel above recursively and calls gemv
193 template <class Triangular,class Side, typename MatA, typename VecB>
194 void trsv(
195  matrix_expression<MatA, gpu_tag> const& A,
196  vector_expression<VecB, gpu_tag>& b
197 ){
198  REMORA_SIZE_CHECK(A().size1() == A().size2());
199  REMORA_SIZE_CHECK(A().size2() == b().size());
200  bindings::trsv_call(A,b,Triangular(), Side());
201 }
202 }}
203 #endif