mgemm.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief The mgemm macro kernel used for implementing gemm
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_MGEMM_HPP
32 #define REMORA_KERNELS_DEFAULT_MGEMM_HPP
33 
34 #include "simd.hpp"
35 #include <algorithm>//std::fill
36 
37 
38 namespace remora{namespace bindings {
39 
40 // Block-GEMM implementation based on boost.ublas
41 // written by:
42 // Copyright (c) 2016
43 // Michael Lehn, Imre Palik
44 //
45 // Distributed under the Boost Software License, Version 1.0. (See
46 // accompanying file LICENSE_1_0.txt or copy at
47 // http://www.boost.org/LICENSE_1_0.txt)
48 
49 //-- Micro Kernel For Dense operations----------------------------------------------------------
50 template <class block_size, class T, class TC>
51 void ugemm(
52  std::size_t kc, TC alpha, T const* A, T const* B,
53  TC* C, std::size_t stride1, std::size_t stride2
54 ){
55  BOOST_ALIGN_ASSUME_ALIGNED(A, block_size::block::align);
56  BOOST_ALIGN_ASSUME_ALIGNED(B, block_size::block::align);
57 
58  typedef typename block_size::block::type vx;
59  static const std::size_t vector_length = block_size::block::vector_elements;
60  static const std::size_t vecNR = block_size::nr/vector_length;
61 #ifdef REMORA_USE_SIMD
62  vx P[block_size::mr * vecNR] = {};
63 #else
64  typename std::aligned_storage<sizeof(vx[block_size::mr*vecNR]),block_size::block::align>::type Pa;
65  T* P = reinterpret_cast<T*>(&Pa);
66  for (std::size_t c = 0; c < block_size::mr*vecNR; c++)
67  P[c] = 0;
68 #endif
69 
70 
71  // perform the matrix-matrix product as outer product
72  // of rows of A and B
73  vx const* b = (vx const*)B;
74  for (std::size_t l=0; l<kc; ++l) {
75  for (std::size_t i=0; i<block_size::mr; ++i) {
76  for (std::size_t j=0; j<vecNR; ++j) {
77  P[i * vecNR+j] += A[i]*b[j];
78  }
79  }
80  A += block_size::mr;
81  b += vecNR;
82  }
83  //multiply with alpha if necessary
84  if (alpha!=TC(1)) {
85  for (std::size_t i=0; i<block_size::mr; ++i) {
86  for (std::size_t j=0; j< vecNR; ++j) {
87  P[i*vecNR+j] *= alpha;
88  }
89  }
90  }
91 
92  //add result to C
93  T const* p = (T const*) P;
94  for (std::size_t i=0; i<block_size::mr; ++i) {
95  for (std::size_t j=0; j<block_size::nr; ++j) {
96  C[i * stride1+j * stride2] += p[i*block_size::nr+j];
97  }
98  }
99 }
100 
101 
102 // Macro Kernel for two densly packed Blocks
103 template <class T, class TC, class block_size>
104 void mgemm(
105  std::size_t mc, std::size_t nc, std::size_t kc, TC alpha,
106  T const* A, T const* B, TC *C,
107  std::size_t stride1, std::size_t stride2, block_size
108 ){
109  static std::size_t const MR = block_size::mr;
110  static std::size_t const NR = block_size::nr;
111  std::size_t const mp = (mc+MR-1) / MR;
112  std::size_t const np = (nc+NR-1) / NR;
113 
114  for (std::size_t j=0; j<np; ++j) {
115  std::size_t const nr = std::min(NR, nc - j*NR);
116 
117  for (std::size_t i=0; i<mp; ++i) {
118  std::size_t const mr = std::min(MR, mc - i*MR);
119  auto CBlockStart = C+i*MR*stride1+j*NR*stride2;
120  if (mr==MR && nr==NR) {
121  ugemm<block_size>(
122  kc, alpha,
123  &A[i*kc*MR], &B[j*kc*NR],
124  CBlockStart, stride1, stride2
125  );
126  } else {
127  TC CTempBlock[MR*NR];
128  std::fill_n(CTempBlock, MR*NR, T(0));
129  ugemm<block_size>(
130  kc, alpha,
131  &A[i*kc*MR], &B[j*kc*NR],
132  CTempBlock, NR, 1
133  );
134 
135  for (std::size_t i0=0; i0<mr; ++i0){
136  for (std::size_t j0=0; j0<nr; ++j0) {
137  CBlockStart[i0*stride1+j0 * stride2] += CTempBlock[i0*NR+j0];
138  }
139  }
140  }
141  }
142  }
143 }
144 
145 
146 //-- Packing blocks ------------------------------------------------------------
147 template <class E, class T, class block_size>
148 void pack_A_dense(matrix_expression<E, cpu_tag> const& A, T* p, block_size)
149 {
150  BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
151 
152  std::size_t const mc = A().size1();
153  std::size_t const kc = A().size2();
154  static std::size_t const MR = block_size::mr;
155  const std::size_t mp = (mc+MR-1) / MR;
156 
157  std::size_t nu = 0;
158  for (std::size_t l=0; l<mp; ++l) {
159  for (std::size_t j=0; j<kc; ++j) {
160  for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
161  p[nu] = (i<mc) ? A()(i,j) : T(0);
162  }
163  }
164  }
165 }
166 
167 
168 template <class E, class T, class block_size>
169 void pack_B_dense(matrix_expression<E, cpu_tag> const& B, T* p, block_size)
170 {
171  BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
172 
173  std::size_t const kc = B ().size1();
174  std::size_t const nc = B ().size2();
175  static std::size_t const NR = block_size::nr;
176  std::size_t const np = (nc+NR-1) / NR;
177 
178  std::size_t nu = 0;
179  for (std::size_t l=0; l<np; ++l) {
180  for (std::size_t i=0; i<kc; ++i) {
181  for (std::size_t j = l*NR; j < l*NR + NR; ++j,++nu){
182  p[nu] = (j<nc) ? B()(i,j) : T(0);
183  }
184  }
185  }
186 }
187 
188 }}
189 
190 #endif