include/shark/LinAlg/BLAS/kernels/default/getrf.hpp Source File

Go to the documentation of this file.
 /*!
  *
  *
  * \brief       Implements the default implementation of the getrf algorithm
  *
  * \author    O. Krause
  * \date        2016
  *
  *
  * \par Copyright 1995-2014 Shark Development Team
  *
  * <BR><HR>
  * This file is part of Shark.
  * <http://image.diku.dk/shark/>
  *
  * Shark is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * Shark is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
  *
  */
 #ifndef REMORA_KERNELS_DEFAULT_GETRF_HPP
 #define REMORA_KERNELS_DEFAULT_GETRF_HPP
 
 #include "simple_proxies.hpp" //proxies for recursive blocking
 #include "../trsm.hpp" //trsm kernel
 #include "../gemm.hpp" //gemm kernel
 #include "../../permutation.hpp" //pivoting
 #include <cmath> //std::abs
 #include <cstddef> //std::size_t
 #include <stdexcept> //std::invalid_argument
 #include <vector>
 
 namespace remora{namespace bindings {
 
 //diagonal block kernels
 template<class MatA, class VecP>
 void getrf_block(
     matrix_expression<MatA, cpu_tag>& A,
     vector_expression<VecP, cpu_tag>& P,
     column_major
 ) {
     for(std::size_t j = 0; j != A().size2(); ++j){
         //search pivot
         double pivot_value = A()(j,j);
         P()(j) = j;
         for(std::size_t i = j+1; i != A().size1(); ++i){
             if(std::abs(A()(i,j)) > std::abs(pivot_value)){
                 P()(j) = i;
                 pivot_value = A()(i,j);
             }
         }
         if(pivot_value == 0)
             throw std::invalid_argument("[getrf] Matrix is rank deficient or numerically unstable");
         //apply row pivoting if needed
         if(std::size_t(P()(j)) != j){
             A().swap_rows(j,P()(j));
         }
                 
         //by definition, L11= 1 and U11=pivot_value
         //so we only need to transform the current column
         //And can skip the current row
         for(std::size_t i = j+1; i != A().size1(); ++i){
             A()(i,j) /= pivot_value;
         }
         
         
         //but we have to apply the outer product to the
         //lower right matrix
         for(std::size_t k = j+1; k != A().size2(); ++k){
             for(std::size_t i = j+1; i != A().size1(); ++i){
                 A()(i,k) -= A()(i,j) * A()(j,k);
             }
         }
     }
 }
 
 
 template<class MatA, class VecP>
 void getrf_block(
     matrix_expression<MatA, cpu_tag>& A,
     vector_expression<VecP, cpu_tag>& P,
     row_major
 ) {
     //ther eis no way to do fast row pivoting on row-major format.
     //so copy the block into column major format, perform the decomposition
     // and copy back.
     typedef typename MatA::value_type value_type;
     std::vector<value_type> storage(A().size1() * A().size2());
     dense_matrix_adaptor<value_type, column_major> colBlock(storage.data(), A().size1(), A().size2());
     kernels::assign(colBlock, A);
     getrf_block(colBlock, P, column_major());
     kernels::assign(A, colBlock);
 }
 
 //note: the row-major overload above copies the block into temporary column-major storage
 
 //main kernel for large matrices
 //we recursively split the matrix into panels along the columns
 //until a panel is small enough. Then we perform the LU decomposition
 //on that panel and update the remaining matrix accordingly.
 // For a given blocking of A
 //     | A11 | A12 |
 // A = | --------- |
 //     | A21 | A22 |
 // where A11 and A22 are square matrices, the LU decomposition
 // is computed recursively by applying the LU decomposition on block A11
 // to obtain A11 = L11*U11 where L11 is unit-lower triangular and U11 is
 // upper triangular.
 // Then we compute 
 // A12<-L11^{-1}A12
 // A21<-A21 U11^{-1}
 // A22<- A22 - A21 * A12
 // and perform the LU-decomposition on A22.
 // in practice the LU decomposition requires a permutation P to be stable
 // where in each iteration the best pivot along the current column is
 // searched. This leads to a panel-wise computation where the
 // computation of A11 and A21 is performed together in order
 // to correctly apply the permutation.
 // afterwards the permutation has to be applied to A12 and A22 before
 // computing their blocks. When A22 is computed, it contains an additional permutation
 // that has to be applied to A21 as well.
 template <typename MatA, typename VecP>
 void getrf_recursive(
     matrix_expression<MatA, cpu_tag>& A,
     vector_expression<VecP, cpu_tag>& P,
     std::size_t start,
     std::size_t end
 ){
     std::size_t block_size = 32;
     std::size_t size = end-start;
     std::size_t end1=A().size1();
     
     //if the matrix is small enough, call the computation kernel directly for the block
     if(size <= block_size){
         auto Ablock = simple_subrange(A, start, end1, start, end); //recursive getrf needs all columns
         auto Pblock = simple_subrange(P, start, end);
         getrf_block(Ablock,Pblock, typename MatA::orientation());
         return;
     }
     
     //otherwise run the kernel recursively
     std::size_t numBlocks = (size + block_size - 1) / block_size;
     std::size_t split = start + numBlocks/2 * block_size;
     auto A_2 = simple_subrange(A, start, end1, split, end);
     auto A11 = simple_subrange(A, start, split, start, split);
     auto A12 = simple_subrange(A, start, split, split, end);
     auto A21 = simple_subrange(A, split, end1, start, split);
     auto A22 = simple_subrange(A, split, end1, split, end);
     auto P1 = simple_subrange(P, start, split);
     auto P2 = simple_subrange(P, split, end);
  
     
     //run recursively on the first block
     getrf_recursive(A, P, start, split);
     
     //block A21 is already transformed
     
     //apply permutation to block A12 and A22 
     swap_rows(P1, A_2);
     // solve system in A12
     kernels::trsm<unit_lower, left>(A11, A12);
     
     //update block A22
     kernels::gemm(A21, A12, A22, -1);
     
     //call recursively getrf on A22
     getrf_recursive(A, P, split, end);
     
     //permute A21 and update P2 to reflect the full matrix
     swap_rows(P2, A21);
     for(auto& p: P2){
         p += split-start;
     }
 }
 
 template <typename MatA, typename VecP>
 void getrf(
     matrix_expression<MatA, cpu_tag>& A,
     vector_expression<VecP, cpu_tag>& P
 ){
     for(std::size_t i = 0; i != P().size(); ++i){
         P()(i) = i;
     }
     getrf_recursive(A, P, 0, A().size1());
 }
 }}
 #endif