syrk.hpp
Go to the documentation of this file.
1 //===========================================================================
2 /*!
3  *
4  *
5  * \brief -
6  *
7  * \author O. Krause
8  * \date 2016
9  *
10  *
11  * \par Copyright 1995-2015 Shark Development Team
12  *
13  * <BR><HR>
14  * This file is part of Shark.
15  * <http://image.diku.dk/shark/>
16  *
17  * Shark is free software: you can redistribute it and/or modify
18  * it under the terms of the GNU Lesser General Public License as published
19  * by the Free Software Foundation, either version 3 of the License, or
20  * (at your option) any later version.
21  *
22  * Shark is distributed in the hope that it will be useful,
23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25  * GNU Lesser General Public License for more details.
26  *
27  * You should have received a copy of the GNU Lesser General Public License
28  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
29  *
30  */
31 //===========================================================================
32 #ifndef REMORA_KERNELS_CLBLAS_SYRK_HPP
33 #define REMORA_KERNELS_CLBLAS_SYRK_HPP
34 
35 #include "../../expression_types.hpp"
36 #include "../../detail/traits.hpp"
37 #include <boost/compute/kernel.hpp>
38 #include <boost/compute/detail/meta_kernel.hpp>
39 #include <boost/compute/functional/operator.hpp> //for multiplies
40 
41 namespace remora{ namespace kernels{
42 
43 // C <- alpha * A * B + beta * C
44 template <bool Upper, typename MatA, typename MatC>
45 void syrk(
46  matrix_expression<MatA, gpu_tag> const& A,
47  matrix_expression<MatC, gpu_tag>& C,
48  typename MatC::value_type const& alpha
49 ) {
50  REMORA_SIZE_CHECK(A().size1() == C().size1());
51  REMORA_SIZE_CHECK(C().size1()== C().size2());
52 
53  // TUNING VARIABLES:
54  // TILE_SIZE: width and height of a tile computed in C
55  // TILE_SIZE_K: length of Tile loaded from A, i.e A is loaded as (TILE_SIZE, TILE_SIZE_K) matrix
56  // BLOCK_SIZE: size of a subblock computed by a single work item (must divide TILE_SIZE)
57  // it holds TILE_SIZE = get_local_size(i) * BLOCK_SIZE for i=0,1
58  // note that BLOCK_SIZE increases the number of registers needed per thread.
59  // roughly O(BLOCK_SIZE^2) registers are needed per thread (+ overhead). Note that register spill might be deadly to performance.
60  // local memory is TILE_SIZE*TILE_SIZE_K*2*sizeof(MatC::value_type)
61  // increasing TILE_SIZE improves the trade-off "computations per memory access"
62  std::size_t BLOCK_SIZE = 4;
63  std::size_t TILE_SIZE = 32;
64  std::size_t NUM_WORKERS = TILE_SIZE / BLOCK_SIZE;
65  char const* options ="-DTILE_SIZE=32ul -DBLOCK_SIZE=4ul -DTILE_SIZE_K=16ul";
66  typedef typename MatC::value_type value_type;
67 
68  boost::compute::detail::meta_kernel k("blas_syrk");
69  std::size_t N_index = k.add_arg<std::size_t>("N");
70  std::size_t K_index = k.add_arg<std::size_t>("K");
71  std::size_t upper_index = k.add_arg<std::size_t>("upper");
72  std::size_t alpha_index = k.add_arg<value_type>("alpha");
73  //check whether we are in a block that is not touched by syrk
74  k <<"if((upper && get_group_id(1) < get_group_id(0))) return;\n";
75  k <<"if((!upper && get_group_id(1) > get_group_id(0))) return;\n";
76 
77  // From now on this the normal gemm kernel with the difference that A and B are the same (but transposed) matrices.
78  // Local memory to fit a tile of A and B
79  // we transpose A locally in memory
80  k << "__local " <<k.decl<value_type>("Asub")<< "[TILE_SIZE_K][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
81  k << "__local " <<k.decl<value_type>("Bsub")<< "[TILE_SIZE_K][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
82  k << " const ulong numWorkers = get_local_size(0);\n";
83  // Initialise the accumulation registers
84  // here the subblock of C for this thread is stored
85  // blocks ae not continuous but strided so that
86  // coalesced write to C is possible
87  // e.g. with 8x8 threads and BLOCK_SIZE 2x2 thread 1 has local tile elements
88  //(0,0) (0,8) (8,0), (8,8). all other blocks are calculated
89  // by adding (local_id(0), local_id(1)) to the coordinates
90  k << k.decl<value_type>("acc") <<"[BLOCK_SIZE][BLOCK_SIZE];\n";
91  k << "for (ulong wm=0; wm<BLOCK_SIZE; wm++){\n";
92  k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
93  k << " acc[wm][wn] = 0.0f;\n";
94  k << " }\n";
95  k << "}\n";
96 
97 
98  // Loop over all tiles
99  k << "ulong numTiles = (K+TILE_SIZE_K-1)/TILE_SIZE_K;\n";
100  k << "for (ulong t=0; t<numTiles; t++){\n";
101 
102  //ensure we are not reading out of bounds in K.
103  k << " const ulong curTileK = min(TILE_SIZE_K, K - t*TILE_SIZE_K);\n";
104 
105  // Load one tile of A and B transposed ulongo local memory using padding
106  k << " for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
107  k << " for(ulong k = get_local_id(1); k < curTileK; k += numWorkers){\n";
108  k << " ulong ktile = t * TILE_SIZE_K + k;\n";
109  k << " Asub[k][i] ="<< A()(k.expr<cl_ulong>("min(N-1,TILE_SIZE * get_group_id(0)+i)"), k.expr<cl_ulong>("ktile"))<<";\n";
110  k << " Bsub[k][i] ="<< A()(k.expr<cl_ulong>("min(N-1,TILE_SIZE * get_group_id(1)+i)"), k.expr<cl_ulong>("ktile"))<<";\n";
111  k << " }\n";
112  k << " }\n";
113 
114  // Synchronise to make sure the tile is loaded
115  k << " barrier(CLK_LOCAL_MEM_FENCE);\n";
116 
117  // Loop over the values of a single tile
118  // by computing outer products ulongo the local accumulation registers acc
119  k << " for (ulong k=0; k<curTileK; k++){\n";
120  // Cache the values of Bsub in registers to save local memory lookups
121  k << k.decl<value_type>("Breg")<<"[BLOCK_SIZE];\n";
122  k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
123  k << " Breg[wn] = Bsub[k][get_local_id(1) + wn * numWorkers];\n";
124  k << " }\n";
125 
126  // Perform the computation
127  k << " for (ulong wm = 0; wm<BLOCK_SIZE; wm++){\n";
128  k << k.decl<value_type>("Areg") << "= Asub[k][get_local_id(0) + wm * numWorkers];\n";
129  k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
130  k << " acc[wm][wn] += Areg * Breg[wn];\n";
131  k << " }\n";
132  k << " }\n";
133  k << " }\n";
134 
135  // Synchronise before loading the next tile
136  k << " barrier(CLK_LOCAL_MEM_FENCE);\n";
137  k << "}\n";
138 
139  // Store the final results in C
140  k << "const ulong maxCi = min(TILE_SIZE, N - get_group_id(0) * TILE_SIZE);\n";
141  k << "const ulong maxCj = min(TILE_SIZE, N - get_group_id(1) * TILE_SIZE);\n";
142  k << "const ulong offTileCi = TILE_SIZE * get_group_id(0);\n";
143  k << "const ulong offTileCj = TILE_SIZE * get_group_id(1);\n";
144  k << "ulong wm = 0;\n";
145  k << "for (ulong i = get_local_id(0); i < maxCi; i += numWorkers, wm++){\n";
146  k << " ulong wn = 0;\n";
147  k << " for (ulong j =get_local_id(1); j < maxCj; j += numWorkers, wn++){\n";
148  k << " if(get_group_id(1) != get_group_id(0) || (upper && j >= i) || (!upper && j <= i) ){\n";
149  k << C()(k.expr<cl_ulong>("(offTileCi + i)"), k.expr<cl_ulong>("(offTileCj + j)")) << "+= alpha * acc[wm][wn];\n";
150  k << " }\n";
151  k << " }\n";
152  k << "}\n";
153 
154  boost::compute::kernel kernel = k.compile(C().queue().get_context(), options);
155 
156  //enqueue kernel with kernel args
157  kernel.set_arg(N_index, C().size1());
158  kernel.set_arg(K_index, A().size2());
159  kernel.set_arg(alpha_index, alpha);
160  kernel.set_arg(upper_index, (std::size_t)Upper);
161 
162  std::size_t global_work_size[2] = {
163  (C().size1()+TILE_SIZE-1)/ TILE_SIZE * NUM_WORKERS,
164  (C().size2()+TILE_SIZE-1)/ TILE_SIZE * NUM_WORKERS
165  };
166  std::size_t local_work_size[2] = {NUM_WORKERS, NUM_WORKERS};
167  C().queue().enqueue_nd_range_kernel(kernel, 2,nullptr, global_work_size, local_work_size);
168 }
169 
170 }}
171 
172 #endif