dense_gemm.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief dense matrix matrix multiplication implementation
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
32 #define REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
33 
34 #include "../gemv.hpp"//for dispatching to gemv
35 #include "../../assignment.hpp"//plus_assign
36 #include "../../detail/matrix_proxy_classes.hpp"//matrix row,column,transpose,range
37 #include "mgemm.hpp" //block macro kernel for dense gemm
38 #include <type_traits> //std::common_type
39 
40 
41 namespace remora{namespace bindings {
42 
43 // Dense Block-GEMM implementation based on boost.ublas
44 // written by:
45 // Copyright (c) 2016
46 // Michael Lehn, Imre Palik
47 //
48 // Distributed under the Boost Software License, Version 1.0. (See
49 // accompanying file LICENSE_1_0.txt or copy at
50 // http://www.boost.org/LICENSE_1_0.txt)
51 
52 template <typename T>
53 struct gemm_block_size {
54  typedef detail::block<T> block;
55  static const unsigned mr = 4; // stripe width for lhs
56  static const unsigned nr = 3 * block::max_vector_elements; // stripe width for rhs
57  static const unsigned mc = 128;
58  static const unsigned kc = 512; // stripe length
59  static const unsigned nc = (1024/nr) * nr;
60 };
61 
62 template <>
63 struct gemm_block_size<float> {
64  typedef detail::block<float> block;
65  static const unsigned mc = 256;
66  static const unsigned kc = 512; // stripe length
67  static const unsigned nc = 4096;
68  static const unsigned mr = 4; // stripe width for lhs
69  static const unsigned nr = 16; // stripe width for rhs
70 };
71 
72 template <>
73 struct gemm_block_size<long double> {
74  typedef detail::block<long double> block;
75  static const unsigned mc = 256;
76  static const unsigned kc = 512; // stripe length
77  static const unsigned nc = 4096;
78  static const unsigned mr = 1; // stripe width for lhs
79  static const unsigned nr = 4; // stripe width for rhs
80 };
81 
82 //-- Dense gemm
83 template <class E1, class E2, class Mat>
84 void dense_gemm(
85  matrix_expression<E1, cpu_tag> const& e1,
86  matrix_expression<E2, cpu_tag> const& e2,
87  matrix_expression<Mat, cpu_tag>& m,
88  typename Mat::value_type alpha
89 ){
90  static_assert(std::is_same<typename Mat::orientation,row_major>::value,"target matrix must be row major");
91  typedef typename std::common_type<
92  typename E1::value_type, typename E2::value_type, typename Mat::value_type
93  >::type value_type;
94 
95  typedef gemm_block_size<
96  typename std::common_type<typename E1::value_type, typename E2::value_type>::type
97  > block_size;
98 
99  static const std::size_t MC = block_size::mc;
100  static const std::size_t NC = block_size::nc;
101  static const std::size_t KC = block_size::kc;
102 
103  //obtain uninitialized aligned storage
104  boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
105  value_type* A = allocator.allocate(MC * KC);
106  value_type* B = allocator.allocate(NC * KC);
107 
108  const std::size_t M = m().size1();
109  const std::size_t N = m().size2();
110  const std::size_t K = e1().size2 ();
111  const std::size_t mb = (M+MC-1) / MC;
112  const std::size_t nb = (N+NC-1) / NC;
113  const std::size_t kb = (K+KC-1) / KC;
114 
115  auto storageM = m().raw_storage();
116  auto C_ = storageM.values;
117  const std::size_t ldc = storageM.leading_dimension;
118  for (std::size_t j=0; j<nb; ++j) {
119  std::size_t nc = std::min(NC, N - j*NC);
120 
121  for (std::size_t l=0; l<kb; ++l) {
122  std::size_t kc = std::min(KC, K - l*KC);
123  matrix_range<typename const_expression<E2>::type> Bs(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc);
124  pack_B_dense(Bs, B, block_size());
125 
126  for (std::size_t i=0; i<mb; ++i) {
127  std::size_t mc = std::min(MC, M - i*MC);
128  matrix_range<typename const_expression<E1>::type> As(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
129  pack_A_dense(As, A, block_size());
130 
131  mgemm(
132  mc, nc, kc, alpha, A, B,
133  &C_[i*MC*ldc+j*NC], ldc , 1, block_size()
134  );
135  }
136  }
137  }
138  //free storage
139  allocator.deallocate(A,MC * KC);
140  allocator.deallocate(B,NC * KC);
141 }
142 
143 }}
144 #endif