trsm.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief -
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_TRSM_HPP
32 #define REMORA_KERNELS_DEFAULT_TRSM_HPP
33 
34 #include "../../expression_types.hpp" //matrix_expression
35 #include "../../detail/structure.hpp" //structure tags
36 #include <stdexcept> //exception when matrix is singular
37 #include "../gemm.hpp" //gemm kernel
38 #include "simple_proxies.hpp"
39 #include <type_traits> //std::false_type marker for unoptimized
40 
41 namespace remora{namespace bindings {
42 
43 //Lower triangular - matrix(row-major)
44 template<std::size_t maxBlockSize1,std::size_t maxBlockSize2, bool Unit, class MatA, class MatB>
45 void trsm_block(
46  matrix_expression<MatA, cpu_tag> const& A,
47  matrix_expression<MatB, cpu_tag> &B,
48  lower,
49  row_major // B is row-major
50 ){
51  REMORA_SIZE_CHECK(A().size1() <= maxBlockSize1);
52  typedef typename MatA::value_type value_typeA;
53  typedef typename MatB::value_type value_typeB;
54 
55 
56  //evaluate and copy block of A
57  std::size_t size = A().size1();
58  value_typeA blockA[maxBlockSize1][maxBlockSize1];
59  for(std::size_t i = 0; i != size; ++i){
60  for(std::size_t j = 0; j <= i; ++j){
61  blockA[i][j] = A()(i,j);
62  }
63  }
64 
65 
66  value_typeB blockB[maxBlockSize2][maxBlockSize1];
67  std::size_t numBlocks = (B().size2()+maxBlockSize2-1)/maxBlockSize2;
68  for(std::size_t i = 0; i != numBlocks; ++i){
69  std::size_t startB= i*maxBlockSize2;
70  std::size_t curBlockSize2 =std::min(maxBlockSize2, B().size2() - startB);
71 
72  //copy blockB transposed in memory
73  for(std::size_t i = 0; i != size; ++i){
74  for(std::size_t k = 0; k != curBlockSize2; ++k){
75  blockB[k][i] = B()(i,startB+k);
76  }
77  }
78  //compute trsv kernel for each row in blockB
79  for(std::size_t k = 0; k != curBlockSize2; ++k){
80  for (std::size_t i = 0; i != size; ++i) {
81  for (std::size_t j = 0; j != i; ++j) {
82  blockB[k][i] -= blockA[i][j]*blockB[k][j];
83  }
84  if(!Unit){
85  if(blockA[i][i] == value_typeA())
86  throw std::invalid_argument("[TRSM] Matrix is singular!");
87  blockB[k][i] /= blockA[i][i];
88  }
89  }
90  }
91  //copy blockB back
92  for(std::size_t i = 0; i != size; ++i){
93  for(std::size_t k = 0; k != curBlockSize2; ++k){
94  B()(i,startB+k) = blockB[k][i];
95  }
96  }
97  }
98 }
99 
100 // Lower triangular - matrix(column-major)
101 template<std::size_t maxBlockSize1,std::size_t maxBlockSize2, bool Unit, class MatA, class MatB>
102 void trsm_block(
103  matrix_expression<MatA, cpu_tag> const& A,
104  matrix_expression<MatB, cpu_tag>& B,
105  lower,
106  column_major // B is column-major
107 ) {
108  typedef typename MatA::value_type value_type;
109  //evaluate and copy block of A
110  std::size_t size = A().size1();
111  value_type blockA[maxBlockSize1][maxBlockSize1];
112  for(std::size_t i = 0; i != size; ++i){
113  for(std::size_t j = 0; j <= i; ++j){
114  blockA[i][j] = A()(i,j);
115  }
116  }
117 
118  //compute trsv kernel for each column in B
119  for(std::size_t k = 0; k != B().size2(); ++k){
120  for (std::size_t i = 0; i != size; ++i) {
121  for (std::size_t j = 0; j != i; ++j) {
122  B()(i,k) -= blockA[i][j] * B()(j,k);
123  }
124  if(!Unit){
125  if(blockA[i][i] == value_type())
126  throw std::invalid_argument("[TRSM] Matrix is singular!");
127  B()(i,k) /= blockA[i][i];
128  }
129  }
130  }
131 }
132 
133 
134 //Upper triangular - matrix(row-major)
135 template<std::size_t maxBlockSize1, std::size_t maxBlockSize2, bool Unit, class MatA, class MatB>
136 void trsm_block(
137  matrix_expression<MatA, cpu_tag> const& A,
138  matrix_expression<MatB, cpu_tag> &B,
139  upper,
140  row_major // B is row-major
141 ){
142  REMORA_SIZE_CHECK(A().size1() <= maxBlockSize1);
143  typedef typename MatA::value_type value_typeA;
144  typedef typename MatB::value_type value_typeB;
145 
146  //evaluate and copy block of A
147  std::size_t size = A().size1();
148  value_typeA blockA[maxBlockSize1][maxBlockSize1];
149  for(std::size_t i = 0; i != size; ++i){
150  for(std::size_t j = i; j != size; ++j){
151  blockA[i][j] = A()(i,j);
152  }
153  }
154 
155  value_typeB blockB[maxBlockSize2][maxBlockSize1];
156  std::size_t numBlocks = (B().size2()+maxBlockSize2-1)/maxBlockSize2;
157  for(std::size_t i = 0; i != numBlocks; ++i){
158  std::size_t startB= i*maxBlockSize2;
159  std::size_t curBlockSize2 =std::min(maxBlockSize2, B().size2() - startB);
160 
161  //copy blockB transposed in memory
162  for(std::size_t i = 0; i != size; ++i){
163  for(std::size_t k = 0; k != curBlockSize2; ++k){
164  blockB[k][i] = B()(i,startB+k);
165  }
166  }
167  //compute trsv kernel for each row in blockB
168  for(std::size_t k = 0; k != curBlockSize2; ++k){
169  for (std::size_t n = 0; n != size; ++n) {
170  std::size_t i = size-n-1;
171  for (std::size_t j = i+1; j != size; ++j) {
172  blockB[k][i] -= blockA[i][j] * blockB[k][j];
173  }
174  if(!Unit){
175  if(blockA[i][i] == value_typeA())
176  throw std::invalid_argument("[TRSM] Matrix is singular!");
177  blockB[k][i] /= blockA[i][i];
178  }
179  }
180  }
181  //copy blockB back
182  for(std::size_t i = 0; i != size; ++i){
183  for(std::size_t j = 0; j != curBlockSize2; ++j){
184  B()(i,startB+j) = blockB[j][i];
185  }
186  }
187  }
188 }
189 
190 // Upper triangular - matrix(column-major)
191 template<std::size_t maxBlockSize1,std::size_t maxBlockSize2, bool Unit, class MatA, class MatB>
192 void trsm_block(
193  matrix_expression<MatA, cpu_tag> const& A,
194  matrix_expression<MatB, cpu_tag>& B,
195  upper,
196  column_major // B is column-major
197 ) {
198  typedef typename MatA::value_type value_type;
199  //evaluate and copy block of A
200  std::size_t size = A().size1();
201  value_type blockA[maxBlockSize1][maxBlockSize1];
202  for(std::size_t i = 0; i != size; ++i){
203  for(std::size_t j = i; j != size; ++j){
204  blockA[i][j] = A()(i,j);
205  }
206  }
207 
208  //compute trsv kernel for each column in B
209  for(std::size_t k = 0; k != B().size2(); ++k){
210  for (std::size_t n = 0; n != size; ++n) {
211  std::size_t i = size-n-1;
212  for (std::size_t j = i+1; j != size; ++j) {
213  B()(i,k) -= blockA[i][j] * B()(j,k);
214  }
215  if(!Unit){
216  if(blockA[i][i] == value_type())
217  throw std::invalid_argument("[TRSM] Matrix is singular!");
218  B()(i,k) /= blockA[i][i];
219  }
220  }
221  }
222 }
223 
224 template <typename MatA, typename MatB, class Triangular>
225 void trsm_recursive(
226  matrix_expression<MatA, cpu_tag> const& Afull,
227  matrix_expression<MatB, cpu_tag> & Bfull,
228  std::size_t start,
229  std::size_t end,
230  Triangular t,
231  left l
232 ){
233  static std::size_t const Block_Size = 32;
234  std::size_t num_rhs = Bfull().size2();
235  auto A = simple_subrange(Afull,start,end,start,end);
236  auto B = simple_subrange(Bfull,start,end,0,num_rhs);
237  //if the matrix is small enough call the computation kernel directly for the block
238  if(A.size1() <= Block_Size){
239  trsm_block<Block_Size,16,Triangular::is_unit>(A,B,triangular_tag<Triangular::is_upper,false>(), typename MatB::orientation());
240  return;
241  }
242  std::size_t size = A.size1();
243  std::size_t numBlocks =(A.size1()+Block_Size-1)/Block_Size;
244  std::size_t split = numBlocks/2*Block_Size;
245  auto Bfront = simple_subrange(B,0,split,0,num_rhs);
246  auto Bback = simple_subrange(B,split,size,0,num_rhs);
247 
248  //otherwise run the kernel recursively
249  if(Triangular::is_upper){ //Upper triangular case
250  trsm_recursive(Afull, Bfull,start+split,end, t, l);
251  kernels::gemm(simple_subrange(A,0,split,split,size), Bback, Bfront, -1.0);
252  trsm_recursive(Afull, Bfull,start,start+split, t, l);
253  }else{// Lower triangular caste
254  trsm_recursive(Afull, Bfull,start,start+split, t, l);
255  kernels::gemm(simple_subrange(A,split,size,0,split), Bfront, Bback, -1.0);
256  trsm_recursive(Afull, Bfull,start+split,end, t, l);
257  }
258 }
259 
260 template <typename MatA, typename MatB, class Triangular>
261 void trsm_recursive(
262  matrix_expression<MatA, cpu_tag> const& Afull,
263  matrix_expression<MatB, cpu_tag> & Bfull,
264  std::size_t start,
265  std::size_t end,
266  Triangular,
267  right
268 ){
269  auto transA = simple_trans(Afull);
270  auto transB = simple_trans(Bfull);
271  trsm_recursive(transA,transB,start,end,typename Triangular::transposed_orientation(),left());
272 }
273 
274 //main kernel runs the kernel above recursively and calls gemv
275 template <class Triangular, class Side, typename MatA, typename MatB>
276 void trsm(
277  matrix_expression<MatA, cpu_tag> const& A,
278  matrix_expression<MatB, cpu_tag>& B,
279  std::false_type //unoptimized
280 ){
281 
282  bindings::trsm_recursive(A,B,0,A().size1(), Triangular(), Side());
283 }
284 }}
285 #endif