trmm.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief Default blocked CPU implementation of the triangular matrix-matrix product (trmm).
5  *
6  * \author O. Krause
7  * \date 2010
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_TRMM_HPP
32 #define REMORA_KERNELS_DEFAULT_TRMM_HPP
33 
34 #include "../../expression_types.hpp"//for matrix_expression
35 #include "../../detail/traits.hpp"//triangular_tag
36 #include "../../detail/matrix_proxy_classes.hpp"//for matrix_range/matrix_transpose
37 #include "simd.hpp"
38 #include "mgemm.hpp" //block macro kernel for dense syrk
39 #include <type_traits> //std::false_type marker for unoptimized, std::common_type
40 namespace remora{ namespace bindings {
41 
42 template <typename T>
43 struct trmm_block_size {
44  typedef detail::block<T> block;
45  static const unsigned mr = 4; // stripe width for lhs
46  static const unsigned nr = 3 * block::max_vector_elements; // stripe width for rhs
47  static const unsigned lhs_block_size = 32 * mr;
48  static const unsigned rhs_column_size = (1024 / nr) * nr;
49  static const unsigned align = 64; // align temporary arrays to this boundary
50 };
51 
52 template <class E, class T, class block_size, bool unit>
53 void pack_A_triangular(matrix_expression<E, cpu_tag> const& A, T* p, block_size, triangular_tag<false,unit>)
54 {
55  BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
56 
57  std::size_t const mc = A().size1();
58  std::size_t const kc = A().size2();
59  static std::size_t const MR = block_size::mr;
60  const std::size_t mp = (mc+MR-1) / MR;
61 
62  std::size_t nu = 0;
63  for (std::size_t l=0; l<mp; ++l) {
64  for (std::size_t j=0; j<kc; ++j) {
65  for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
66  if(unit && i == j)
67  p[nu] = 1.0;
68  else
69  p[nu] = ((i<mc) && (i >= j)) ? A()(i,j) : T(0);
70  }
71  }
72  }
73 }
74 
75 template <class E, class T, class block_size, bool unit>
76 void pack_A_triangular(matrix_expression<E, cpu_tag> const& A, T* p, block_size, triangular_tag<true,unit>)
77 {
78  BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
79 
80  std::size_t const mc = A().size1();
81  std::size_t const kc = A().size2();
82  static std::size_t const MR = block_size::mr;
83  const std::size_t mp = (mc+MR-1) / MR;
84 
85  std::size_t nu = 0;
86  for (std::size_t l=0; l<mp; ++l) {
87  for (std::size_t j=0; j<kc; ++j) {
88  for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
89  if(unit && i == j)
90  p[nu] = 1.0;
91  else
92  p[nu] = ((i<mc) && (i <= j)) ? A()(i,j) : T(0);
93  }
94  }
95  }
96 }
97 
98 
/// \brief Blocked in-place kernel computing e2 := e1 * e2 with triangular e1.
///
/// e1 (square, triangular as described by Triangular) and e2 are processed in
/// cache blocks: each AC x kc block of e1 and each kc x nc block of e2 is
/// packed into an aligned temporary buffer and multiplied by the mgemm macro
/// kernel, which accumulates directly into the raw storage of e2. Because the
/// result overwrites e2, the traversal order of the diagonal blocks matters
/// (see the loop comment below).
///
/// \param e1 triangular lhs matrix (read only)
/// \param e2 dense rhs matrix, overwritten with the product
/// \param t  triangular_tag selecting upper/lower and unit/non-unit diagonal
template <class E1, class E2, class Triangular>
void trmm_impl(
	matrix_expression<E1, cpu_tag> const& e1,
	matrix_expression<E2, cpu_tag> & e2,
	Triangular t
){
	typedef typename std::common_type<typename E1::value_type, typename E2::value_type>::type value_type;
	typedef trmm_block_size<value_type> block_size;

	static const std::size_t AC = block_size::lhs_block_size;
	static const std::size_t BC = block_size::rhs_column_size;

	//obtain uninitialized aligned storage for the packed panels
	//NOTE(review): these raw allocations leak if packing or mgemm throws;
	//consider an RAII wrapper — confirm against the project's exception policy.
	boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
	value_type* A = allocator.allocate(AC * AC);
	value_type* B = allocator.allocate(AC * BC);

	//figure out number of blocks to use (ceiling division, last block may be partial)
	const std::size_t M = e1().size1();
	const std::size_t N = e2().size2();
	const std::size_t Ab = (M+AC-1) / AC;
	const std::size_t Bb = (N+BC-1) / BC;

	//get access to raw storage of B; stride1/stride2 are the row/column strides
	//of e2 independent of its orientation
	auto storageB = e2().raw_storage();
	auto Bpointer = storageB.values;
	const std::size_t stride1 = E2::orientation::index_M(storageB.leading_dimension,1);
	const std::size_t stride2 = E2::orientation::index_m(storageB.leading_dimension,1);

	for (std::size_t j = 0; j < Bb; ++j) {//column blocks of B
		std::size_t nc = std::min(BC, N - j * BC);
		//We exploit the triangular nature of A and the fact that rhs and result
		//share the same storage: a row block of B may only be overwritten once
		//every product that still needs its old values has consumed it.
		//For upper triangular A the diagonal blocks are visited first to last;
		//for lower triangular A the order is reversed.
		for (std::size_t k0 = 0; k0< Ab; ++k0){//row-blocks of B = column blocks of A.
			std::size_t k = Triangular::is_upper? k0: Ab-k0-1;
			std::size_t kc = std::min(AC, M - k * AC);
			//pack row block k of B into the temporary buffer
			matrix_range<E2> Bs(e2(), k*AC, k*AC + kc, j * BC, j * BC + nc );
			pack_B_dense(Bs, B, block_size());
			Bs.clear();//its going to be overwritten with the result (mgemm accumulates)

			//apply block k of B to all blocks of A needing it. we will overwrite the
			//stored B block as a result of this; that is okay as its old values live
			//in the packed buffer.
			//upper triangular: column block k of A is nonzero only in rows 0..k;
			//lower triangular: only in rows k..Ab-1.
			std::size_t i_start = Triangular::is_upper? 0: k;
			std::size_t i_end = Triangular::is_upper? k+1: Ab;
			for (std::size_t i = i_start; i < i_end; ++i) {
				std::size_t mc = std::min(AC, M - i * AC);
				matrix_range<typename const_expression<E1>::type> As(e1(), i * AC, i * AC + mc, k * AC, k * AC + kc);
				if(i == k){
					//diagonal block: pack with triangular masking/unit diagonal
					pack_A_triangular(As, A, block_size(), t);
				}else
					pack_A_dense(As, A, block_size());

				//accumulate A_ik * B_k into row block i of the result
				mgemm(
					mc, nc, kc, value_type(1.0), A, B,
					&Bpointer[i*AC * stride1 + j*BC * stride2], stride1, stride2, block_size()
				);
			}
		}
	}
	//free storage
	allocator.deallocate(A,AC * AC);
	allocator.deallocate(B,AC * BC);
}
165 
166 
167 
168 //main kernel runs the kernel above recursively and calls gemv
169 template <bool Upper,bool Unit,typename MatA, typename MatB>
170 void trmm(
171  matrix_expression<MatA, cpu_tag> const& A,
172  matrix_expression<MatB, cpu_tag>& B,
173  std::false_type //unoptimized
174 ){
175  REMORA_SIZE_CHECK(A().size1() == A().size2());
176  REMORA_SIZE_CHECK(A().size2() == B().size1());
177 
178  trmm_impl(A,B, triangular_tag<Upper,Unit>());
179 }
180 
181 }}
182 
183 #endif