syrk.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief -
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_SYRK_HPP
32 #define REMORA_KERNELS_DEFAULT_SYRK_HPP
33 
34 #include "../../expression_types.hpp"//for matrix_expression
35 #include "../../detail/matrix_proxy_classes.hpp"//for matrix_range/matrix_transpose
36 #include "mgemm.hpp" //block macro kernel for dense syrk
37 #include <type_traits> //std::false_type marker for unoptimized, std::common_Type
38 
39 namespace remora { namespace bindings {
40 
41 
42 template <typename T>
43 struct syrk_block_size {
44  typedef detail::block<T> block;
45  static const unsigned mr = 4; // stripe width for E_left
46  static const unsigned nr = mr * block::max_vector_elements; // stripe width for E_right
47  static const unsigned lhs_block_size = 3 * mr * nr;//square block size of M to compute
48  static const unsigned rhs_k_size = 1024;//strip of ks to compute
49 };
50 template <class E, class Mat, class Triangular>
51 void syrk_impl(
52  matrix_expression<E, cpu_tag> const& e,
53  matrix_expression<Mat, cpu_tag>& m,
54  typename Mat::value_type& alpha,
55  Triangular t
56 ){
57  typedef typename E::value_type value_type;
58  typedef syrk_block_size<value_type> block_size;
59 
60  static const std::size_t MC = block_size::lhs_block_size;
61  static const std::size_t EC = block_size::rhs_k_size;
62 
63  //obtain uninitialized aligned storage
64  boost::alignment::aligned_allocator<value_type,block_size::block::align> allocatorE;
65  boost::alignment::aligned_allocator<typename Mat::value_type,block_size::block::align> allocatorM;
66  value_type* E_left = allocatorE.allocate(MC * EC);
67  value_type* E_right = allocatorE.allocate(MC * EC);
68  auto M_diagonal_block = allocatorM.allocate(MC * MC);
69 
70  //figure out number of blocks to use
71  const std::size_t M = e().size1();
72  const std::size_t K = e().size2();
73  const std::size_t Mb = (M+MC-1) / MC;//we split m in Mb x Mb blocks
74  const std::size_t Eb = (K+EC-1) / EC;//we split B in Mb x Eb blocks
75 
76  //get access to raw storage of M
77  auto storageM = m().raw_storage();
78  auto Mpointer = storageM.values;
79  const std::size_t stride1 = Mat::orientation::index_M(storageM.leading_dimension,1);
80  const std::size_t stride2 = Mat::orientation::index_m(storageM.leading_dimension,1);
81 
82  for (std::size_t k = 0; k < Eb; ++k) {//column blocks of E
83  std::size_t kc = std::min(EC, K - k * EC);
84  for (std::size_t i = 0; i < Mb; ++i){//row-blocks of M
85  std::size_t mc = std::min(MC, M - i * MC);
86  //load block of the left E into memory
87  matrix_range<E const> E_lefts(e(), i * MC, i * MC + mc, k*EC, k*EC + kc );
88  pack_A_dense(E_lefts, E_left, block_size());
89 
90  std::size_t start_j = Triangular::is_upper? i : 0;
91  std::size_t end_j = Triangular::is_upper? Mb : i+1;
92  for(std::size_t j = start_j; j < end_j; ++j){//traverse over the blocks that are to be computed
93  std::size_t mc2 = std::min(MC, M - j * MC);
94  //load block of the right E into memory
95  matrix_range<typename const_expression<E>::type> E_rights(e(), j * MC, j * MC + mc2, k*EC, k*EC + kc );
96  matrix_transpose<matrix_range<typename const_expression<E>::type> > E_rights_trans(E_rights);
97  pack_B_dense(E_rights_trans, E_right, block_size());
98 
99  if(i==j){//diagonal block: we have to ensure that we only access elements on the diagonal
100  for(std::size_t i0 = 0; i0 != mc; ++i0){
101  for(std::size_t j0 = 0; j0 != mc2; ++j0){
102  M_diagonal_block[i0*MC+j0] = 0.0;
103  }
104  }
105  mgemm(
106  mc, mc2, kc, alpha, E_left, E_right,
107  M_diagonal_block, MC, 1, block_size()
108  );
109  auto M_diagonal = Mpointer + i * MC * stride1 + j * MC * stride2;
110  for(std::size_t i0 = 0; i0 != mc; ++i0){
111  std::size_t start_j0 = Triangular::is_upper? i0 : 0;
112  std::size_t end_j0 = Triangular::is_upper? mc2 : i0+1;
113  for(std::size_t j0 = start_j0; j0 < end_j0; ++j0){
114  M_diagonal[i0*stride1+j0*stride2] += M_diagonal_block[i0*MC+j0];
115  }
116  }
117  }else{
118  mgemm(
119  mc, mc2, kc, alpha, E_left, E_right,
120  &Mpointer[i*MC * stride1 + j*MC * stride2], stride1, stride2, block_size()
121  );
122  }
123  }
124  }
125  }
126  //free storage
127  allocatorE.deallocate(E_left,MC * EC);
128  allocatorE.deallocate(E_right,MC * EC);
129  allocatorM.deallocate(M_diagonal_block, MC * MC);
130 }
131 
132 
133 
134 //main kernel runs the kernel above recursively and calls gemv
135 template <bool Upper, typename M, typename E>
136 void syrk(
137  matrix_expression<E, cpu_tag> const& e,
138  matrix_expression<M, cpu_tag>& m,
139  typename M::value_type& alpha,
140  std::false_type //unoptimized
141 ){
142  REMORA_SIZE_CHECK(m().size1() == m().size2());
143  REMORA_SIZE_CHECK(m().size2() == e().size1());
144 
145  syrk_impl(e,m, alpha, triangular_tag<Upper,false>());
146 }
147 
148 }}
149 
150 #endif