getrf.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief Implements the default implementation of the getrf algorithm
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2014 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 #ifndef REMORA_KERNELS_DEFAULT_GETRF_HPP
31 #define REMORA_KERNELS_DEFAULT_GETRF_HPP
32 
33 #include "simple_proxies.hpp" //proxies for recursive blocking
34 #include "../trsm.hpp" //trsm kernel
35 #include "../gemm.hpp" //gemm kernel
36 #include "../../permutation.hpp" //pivoting
37 #include <vector>
38 
39 namespace remora{namespace bindings {
40 
41 //diagonal block kernels
42 template<class MatA, class VecP>
43 void getrf_block(
44  matrix_expression<MatA, cpu_tag>& A,
45  vector_expression<VecP, cpu_tag>& P,
46  column_major
47 ) {
48  for(std::size_t j = 0; j != A().size2(); ++j){
49  //search pivot
50  double pivot_value = A()(j,j);
51  P()(j) = j;
52  for(std::size_t i = j+1; i != A().size1(); ++i){
53  if(std::abs(A()(i,j)) > std::abs(pivot_value)){
54  P()(j) = i;
55  pivot_value = A()(i,j);
56  }
57  }
58  if(pivot_value == 0)
59  throw std::invalid_argument("[getrf] Matrix is rank deficient or numerically unstable");
60  //apply row pivoting if needed
61  if(std::size_t(P()(j)) != j){
62  A().swap_rows(j,P()(j));
63  }
64 
65  //by definition, L11= 1 and U11=pivot_value
66  //so we only need to transform the current column
67  //And can skip the current row
68  for(std::size_t i = j+1; i != A().size1(); ++i){
69  A()(i,j) /= pivot_value;
70  }
71 
72 
73  //but we have to apply the outer product to the
74  //lower right matrix
75  for(std::size_t k = j+1; k != A().size2(); ++k){
76  for(std::size_t i = j+1; i != A().size1(); ++i){
77  A()(i,k) -= A()(i,j) * A()(j,k);
78  }
79  }
80  }
81 }
82 
83 
84 template<class MatA, class VecP>
85 void getrf_block(
86  matrix_expression<MatA, cpu_tag>& A,
87  vector_expression<VecP, cpu_tag>& P,
88  row_major
89 ) {
90  //ther eis no way to do fast row pivoting on row-major format.
91  //so copy the block into column major format, perform the decomposition
92  // and copy back.
93  typedef typename MatA::value_type value_type;
94  std::vector<value_type> storage(A().size1() * A().size2());
95  dense_matrix_adaptor<value_type, column_major> colBlock(storage.data(), A().size1(), A().size2());
96  kernels::assign(colBlock, A);
97  getrf_block(colBlock, P, column_major());
98  kernels::assign(A, colBlock);
99 }
100 
101 //todo: row-major needs to copy the block in temporary storage
102 
103 //main kernel for large matrices
104 //we recursively split the matrix into panels along the columns
105 //until a panel is small enough. Then we perform the LU decomposition
106 //on that panel and update the remaining matrix accordingly.
107 // For a given blocking of A
108 // | A11 | A12 |
109 // A = | --------- |
110 // | A21 | A22 |
111 // where A11 and A22 are square matrices, the LU decomposition
112 // is computed recursively by applying the LU decomposition on block A11
113 // to obtain A11= L11*U11 where L is unit-lower triangular and U
114 // upper triangular.
115 // Then we compute
116 // A12<-L11^{-1}A21
117 // A21<-A21 U11^{-1}
118 // A22<- A22 - A21 * A12
119 // and perform the LU-decomposition on A22.
120 // in practice the LU decomposition requires a permutation P to be stable
121 // where in each iteration the best pivot along the current column is
122 // searched. This leads to a panel-wise computation where the
123 // computation of A11 and A21 is performed together in order
124 // to correctly apply the permutation.
125 // afterwards the permutation has to be applied to A12 and A22 before
126 // computing their blocks. When A22 is computed, it contains an additional permutation
127 // that has to be applied to A21 as well.
128 template <typename MatA, typename VecP>
129 void getrf_recursive(
130  matrix_expression<MatA, cpu_tag>& A,
131  vector_expression<VecP, cpu_tag>& P,
132  std::size_t start,
133  std::size_t end
134 ){
135  std::size_t block_size = 32;
136  std::size_t size = end-start;
137  std::size_t end1=A().size1();
138 
139  //if the matrix is small enough, call the computation kernel directly for the block
140  if(size <= block_size){
141  auto Ablock = simple_subrange(A, start, end1, start, end); //recursive getrf needs all columns
142  auto Pblock = simple_subrange(P, start, end);
143  getrf_block(Ablock,Pblock, typename MatA::orientation());
144  return;
145  }
146 
147  //otherwise run the kernel recursively
148  std::size_t numBlocks = (size + block_size - 1) / block_size;
149  std::size_t split = start + numBlocks/2 * block_size;
150  auto A_2 = simple_subrange(A, start, end1, split, end);
151  auto A11 = simple_subrange(A, start, split, start, split);
152  auto A12 = simple_subrange(A, start, split, split, end);
153  auto A21 = simple_subrange(A, split, end1, start, split);
154  auto A22 = simple_subrange(A, split, end1, split, end);
155  auto P1 = simple_subrange(P, start, split);
156  auto P2 = simple_subrange(P, split, end);
157 
158 
159  //run recursively on the first block
160  getrf_recursive(A, P, start, split);
161 
162  //block A21 is already transformed
163 
164  //apply permutation to block A12 and A22
165  swap_rows(P1, A_2);
166  // solve system in A12
167  kernels::trsm<unit_lower, left>(A11, A12);
168 
169  //update block A22
170  kernels::gemm(A21, A12, A22, -1);
171 
172  //call recursively getrf on A22
173  getrf_recursive(A, P, split, end);
174 
175  //permute A21 and update P2 to reflect the full matrix
176  swap_rows(P2, A21);
177  for(auto& p: P2){
178  p += split-start;
179  }
180 }
181 
182 template <typename MatA, typename VecP>
183 void getrf(
184  matrix_expression<MatA, cpu_tag>& A,
185  vector_expression<VecP, cpu_tag>& P
186 ){
187  for(std::size_t i = 0; i != P().size(); ++i){
188  P()(i) = i;
189  }
190  getrf_recursive(A, P, 0, A().size1());
191 }
192 }}
193 #endif