trsm.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief GPU (OpenCL) kernels for solving systems of triangular linear equations (TRSM)
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_CLBLAS_TRSM_HPP
32 #define REMORA_KERNELS_CLBLAS_TRSM_HPP
33 
34 #include "../../expression_types.hpp"
35 #include "../../detail/traits.hpp"
36 #include <boost/compute/kernel.hpp>
37 #include <boost/compute/detail/meta_kernel.hpp>
38 #include <boost/compute/functional/operator.hpp> //for multiplies
39 ///solves systems of triangular matrices
40 
41 namespace remora {namespace bindings {
/// Handle bundling the compiled diagonal-block TRSM kernel with the
/// positions of its runtime arguments, so callers can re-set the
/// arguments cheaply before every enqueue (see trsm_recursive).
struct trsm_kernel{
	boost::compute::kernel kernel; ///< compiled OpenCL kernel produced by createTRSMDiagBlockKernel
	std::size_t K_index;     ///< argument slot: number of columns of B
	std::size_t start_index; ///< argument slot: first row/column of the diagonal block of A
	std::size_t end_index;   ///< argument slot: one-past-last row/column of the diagonal block of A
	std::size_t unit_index;  ///< argument slot: nonzero if A is unit-triangular (implicit 1s on the diagonal)
	std::size_t upper_index; ///< argument slot: nonzero if A is upper-triangular
};
/// \brief Generates and compiles the OpenCL kernel solving a single diagonal
/// block of the triangular system A X = B in-place in B (A row-major).
///
/// The kernel handles both the lower- and upper-triangular case (selected at
/// run time via the "upper" argument) and optionally treats A as
/// unit-triangular ("unit" argument). One work group processes one
/// TILE_SIZE x TILE_SIZE diagonal block of A against a TILE_SIZE_K-column
/// slice of B; the slice index is get_group_id(1).
///
/// \param A the triangular matrix (only the block [start,end) x [start,end) is read per launch)
/// \param B the right-hand side, overwritten with the solution rows [start,end)
/// \param options compile options; must define TILE_SIZE and TILE_SIZE_K
/// \return the compiled kernel plus the indices of its runtime arguments
template<class MatA, class MatB>
trsm_kernel createTRSMDiagBlockKernel(
	matrix_expression<MatA, gpu_tag> const& A,
	matrix_expression<MatB, gpu_tag> &B,
	char const* options
){
	typedef typename MatA::value_type value_typeA;
	typedef typename MatB::value_type value_typeB;
	boost::compute::multiplies<value_typeB> prod;

	boost::compute::detail::meta_kernel k("blas_trsm");
	std::size_t K_index = k.add_arg<std::size_t>("K");//number of columns in B
	std::size_t start_index = k.add_arg<std::size_t>("start");//start of block of A
	std::size_t end_index = k.add_arg<std::size_t>("end");//end of Block of A
	std::size_t unit_index = k.add_arg<std::size_t>("unit");//whether A is unit triangular
	std::size_t upper_index = k.add_arg<std::size_t>("upper");//whether A is upper triangular
	// Local memory to fit a tile of A and B
	// we store B as column major in local memory
	// we also allocate memory to store results of B
	k << "__local " <<k.decl<value_typeA>("Asub")<< "[TILE_SIZE][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << "__local " <<k.decl<value_typeB>("Bsub")<< "[TILE_SIZE_K][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << "const ulong numWorkers = get_local_size(0);\n";
	//ensure we are not reading out of bounds
	// t: which TILE_SIZE_K-wide column slice of B this work group handles;
	// curTileA/curTileK: actual (possibly partial) extents of the current tiles
	k << "const ulong t = get_group_id(1);\n";
	k << "const ulong curTileA = end-start;\n";
	k << "const ulong curTileK = min(TILE_SIZE_K, K - t*TILE_SIZE_K);\n";

	// Load tile of A into local memory.
	// Out-of-range elements are clamped to the last valid index via min();
	// the duplicated values are never used for stored results, only to keep
	// every thread's loads in bounds.
	k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << " for(ulong j = get_local_id(1); j < TILE_SIZE; j += numWorkers){\n";
	k << " Asub[i][j] ="<< A()(k.expr<cl_ulong>("min(end-1, start + i)"),k.expr<cl_ulong>("min(end-1, start + j)"))<<";\n";
	k << " }\n";
	k << "}\n";


	// Load Tile of B into local memory, store columns of B as rows
	// (transposed layout, so the substitution loops below walk rows of Bsub)
	k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << " for(ulong k = get_local_id(1); k < TILE_SIZE_K; k += numWorkers){\n";
	k << " Bsub[k][i] ="<< B()(k.expr<cl_ulong>("min(end-1,start + i)"),k.expr<cl_ulong>("min(K-1,t * TILE_SIZE_K+k)"))<<";\n";
	k << " }\n";
	k << "}\n";
	// Synchronise to make sure the tiles are loaded
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";

	// Loop over the values of a single tile.
	// The substitution over rows of A is inherently sequential, so only the
	// threads with get_local_id(0) == 0 perform it; parallelism comes from the
	// independent columns k of B.
	//lower case: forward substitution
	k << "if(!upper){\n";
	k << " for(ulong k = get_local_id(1); k < curTileK; k += numWorkers){\n";
	k << " for(ulong i = 0; i < TILE_SIZE && get_local_id(0) == 0; ++i){\n";
	k << " if(!unit){Bsub[k][i] /= Asub[i][i];}\n";
	k << " for(ulong j = i+1; j < TILE_SIZE; ++j){\n";
	k << " Bsub[k][j] -= "<< prod(k.expr<value_typeB>("Bsub[k][i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
	k << " }\n";
	k << " }\n";
	k << " }\n";
	k << "}else{\n";
	//upper case: backward substitution (rows processed from curTileA-1 down to 0)
	k << " for(ulong k = get_local_id(1); k < curTileK; k += numWorkers){\n";
	k << " for(ulong n = curTileA; n > 0 && get_local_id(0) == 0; --n){\n";
	k << " ulong i = n-1;\n";
	k << " if(!unit ){Bsub[k][i] /= Asub[i][i];}\n";
	k << " for(ulong j = 0; j < i; j ++){\n";
	k << " Bsub[k][j] -= "<< prod(k.expr<value_typeB>("Bsub[k][i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
	k << " }\n";
	k << " }\n";
	k << " }\n";
	k << "}\n";
	// Synchronise before continuing
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
	// Store the final results back in B (only the valid curTileA x curTileK part)
	k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << " for(ulong k = get_local_id(1); k < curTileK; k += numWorkers){\n";
	k << B()(k.expr<cl_ulong>("(start+i)"),k.expr<cl_ulong>("(t * TILE_SIZE_K+k)"))<<" = Bsub[k][i];\n";
	k << " }\n";
	k << "}\n";

	boost::compute::kernel kernel = k.compile(B().queue().get_context(), options);
	return {kernel,K_index,start_index,end_index,unit_index,upper_index};
}
130 
/// \brief Recursively solves the triangular system restricted to rows/columns
/// [start,end) of Afull against the corresponding rows of Bfull, in-place.
///
/// Blocks of size <= tileSizeA are solved directly by the precompiled diagonal
/// kernel; larger problems are split in half, with a gemm update propagating
/// the solved half into the remaining right-hand side.
///
/// \param Afull the full triangular matrix; only the [start,end) square is touched
/// \param Bfull the full right-hand side, overwritten with the solution
/// \param kernel the diagonal-block kernel created by createTRSMDiagBlockKernel
/// \param start first row/column of the current sub-problem
/// \param end one-past-last row/column of the current sub-problem
/// \param tileSizeA maximum block size handled directly by the kernel
/// \param tileSizeB column-slice width of B used to size the launch grid
/// \param numWorkers work-group edge length (work groups are numWorkers x numWorkers)
/// \param t tag carrying the triangular structure (upper/lower, unit/non-unit)
template <typename MatA, typename MatB, class Triangular>
void trsm_recursive(
	matrix_expression<MatA, gpu_tag> const& Afull,
	matrix_expression<MatB, gpu_tag> & Bfull,
	trsm_kernel& kernel,
	std::size_t start,
	std::size_t end,
	std::size_t tileSizeA,
	std::size_t tileSizeB,
	std::size_t numWorkers,
	Triangular t
){
	auto A = subrange(Afull,start,end,start,end);
	auto B = rows(Bfull,start,end);
	std::size_t size = A.size1();
	//if the matrix is small enough call the computation kernel directly for the block
	if(size <= tileSizeA){
		//enqueue kernel with kernel args
		kernel.kernel.set_arg(kernel.K_index, Bfull().size2());
		kernel.kernel.set_arg(kernel.start_index, start);
		kernel.kernel.set_arg(kernel.end_index, end);
		kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
		kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);

		// one work group per TILE_SIZE_K-wide column slice of B (second dimension)
		std::size_t global_work_size[2] = {
			numWorkers,
			(Bfull().size2()+tileSizeB-1)/ tileSizeB * numWorkers
		};
		std::size_t local_work_size[2] = {numWorkers, numWorkers};
		Bfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,nullptr, global_work_size, local_work_size);
		return;
	}
	// split on a tile boundary so recursion eventually hits size <= tileSizeA
	std::size_t numBlocks = (A.size1()+tileSizeA-1)/tileSizeA;
	std::size_t split = numBlocks/2 * tileSizeA;
	auto Bfront = rows(B,0,split);
	auto Bback = rows(B,split,size);

	//otherwise run the kernel recursively
	if(Triangular::is_upper){ //Upper triangular case
		// solve the lower-right block first, then eliminate its contribution
		// from the upper rows before solving the upper-left block
		trsm_recursive(Afull, Bfull, kernel, start+split,end, tileSizeA,tileSizeB, numWorkers, t);
		kernels::gemm(subrange(A,0,split,split,size), Bback, Bfront, -1.0);
		trsm_recursive(Afull, Bfull, kernel, start,start+split, tileSizeA,tileSizeB, numWorkers, t);
	}else{// Lower triangular case
		// solve the upper-left block first, then eliminate its contribution
		// from the lower rows before solving the lower-right block
		trsm_recursive(Afull, Bfull, kernel, start,start+split, tileSizeA,tileSizeB, numWorkers, t);
		kernels::gemm(subrange(A,split,size,0,split), Bfront, Bback, -1.0);
		trsm_recursive(Afull, Bfull, kernel, start+split,end, tileSizeA,tileSizeB, numWorkers, t);
	}
}
179 
180 template <typename MatA, typename MatB, class Triangular>
181 void trsm_call(
182  matrix_expression<MatA, gpu_tag> const& A,
183  matrix_expression<MatB, gpu_tag>& B,
184  Triangular,
185  left
186 ){
187  REMORA_SIZE_CHECK(A().size1() == A().size2());
188  REMORA_SIZE_CHECK(A().size2() == B().size1());
189  std::size_t const TileSizeA = 32;//size of the diagonal blocks where the single kernel runs
190  std::size_t const TileSizeB = 32;// size of the blocks B is partitioned into along the number of columns
191  std::size_t const numWorkers = 8; //number of workers in two dimensions (e.g. 8x8=64)
192  char const* options ="-DTILE_SIZE=32ul -DTILE_SIZE_K=32ul";
193  auto kernel = bindings::createTRSMDiagBlockKernel(A,B,options);
194 
195  trsm_recursive(A,B,kernel,0,A().size1(), TileSizeA, TileSizeB, numWorkers,Triangular());
196 }
197 
198 template <typename MatA, typename MatB, class Triangular>
199 void trsm_call(
200  matrix_expression<MatA, gpu_tag> const& A,
201  matrix_expression<MatB, gpu_tag>& B,
202  Triangular,
203  right
204 ){
205  matrix_transpose<typename const_expression<MatA>::type> transA(A());
206  matrix_transpose<MatB> transB(B());
207  trsm_call(transA,transB,typename Triangular::transposed_orientation(),left());
208 }
209 
210 }
211 namespace kernels{
//public entry point: dispatches to the recursive blocked solver (which calls gemm)
/// \brief Solves a system of triangular equations in-place.
///
/// Side selects whether A multiplies from the left (A X = B) or the right
/// (X A = B); Triangular describes A (upper/lower, unit/non-unit diagonal).
/// B is overwritten with the solution X.
template <class Triangular, class Side, typename MatA, typename MatB>
void trsm(
	matrix_expression<MatA, gpu_tag> const& A,
	matrix_expression<MatB, gpu_tag>& B
){
	bindings::trsm_call(A,B,Triangular(), Side());
}
220 }}
221 #endif