include/shark/LinAlg/BLAS/kernels/default/syrk.hpp Source File

Go to the documentation of this file.
 /*!
  *
  *
  * \brief       -
  *
  * \author      O. Krause
  * \date        2016
  *
  *
  * \par Copyright 1995-2015 Shark Development Team
  *
  * <BR><HR>
  * This file is part of Shark.
  * <http://image.diku.dk/shark/>
  *
  * Shark is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published
  * by the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * Shark is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
  *
  */
 
 #ifndef REMORA_KERNELS_DEFAULT_SYRK_HPP
 #define REMORA_KERNELS_DEFAULT_SYRK_HPP
 
 #include "../../expression_types.hpp"//for matrix_expression
 #include "../../detail/matrix_proxy_classes.hpp"//for matrix_range/matrix_transpose
 #include "mgemm.hpp" //block macro kernel for dense syrk
 #include <type_traits> //std::false_type marker for unoptimized, std::common_Type
 
 namespace remora { namespace bindings {
 
 
 template <typename T>
 struct syrk_block_size {
     typedef detail::block<T> block;
     static const unsigned mr = 4; // stripe width for E_left
     static const unsigned nr = mr * block::max_vector_elements; // stripe width for E_right
     static const unsigned lhs_block_size = 3 * mr * nr;//square block size of M to compute
     static const unsigned rhs_k_size = 1024;//strip of ks to compute
 };
 template <class E, class Mat, class Triangular>
 void syrk_impl(
     matrix_expression<E, cpu_tag> const& e,
     matrix_expression<Mat, cpu_tag>& m,
     typename Mat::value_type& alpha,
     Triangular t
 ){
     typedef typename E::value_type value_type;
     typedef syrk_block_size<value_type> block_size;
 
     static const std::size_t MC = block_size::lhs_block_size;
     static const std::size_t EC = block_size::rhs_k_size;
 
     //obtain uninitialized aligned storage
     boost::alignment::aligned_allocator<value_type,block_size::block::align> allocatorE;
     boost::alignment::aligned_allocator<typename Mat::value_type,block_size::block::align> allocatorM;
     value_type* E_left = allocatorE.allocate(MC * EC);
     value_type* E_right = allocatorE.allocate(MC * EC);
     auto M_diagonal_block = allocatorM.allocate(MC * MC);
 
     //figure out number of blocks to use
     const std::size_t  M = e().size1();
     const std::size_t  K = e().size2();
     const std::size_t Mb = (M+MC-1) / MC;//we split m in Mb x Mb blocks
     const std::size_t Eb = (K+EC-1) / EC;//we split B in Mb x Eb blocks
 
     //get access to raw storage of M
     auto storageM = m().raw_storage();
     auto Mpointer = storageM.values;
     const std::size_t stride1 = Mat::orientation::index_M(storageM.leading_dimension,1);
     const std::size_t stride2 = Mat::orientation::index_m(storageM.leading_dimension,1);
 
     for (std::size_t k = 0; k < Eb; ++k) {//column blocks of E
         std::size_t kc = std::min(EC, K - k * EC);
         for (std::size_t i = 0; i < Mb; ++i){//row-blocks of M
             std::size_t mc = std::min(MC, M - i * MC);
             //load block of the left E into memory
             matrix_range<E const> E_lefts(e(), i * MC, i * MC + mc, k*EC, k*EC + kc );
             pack_A_dense(E_lefts, E_left, block_size());
 
             std::size_t start_j = Triangular::is_upper? i : 0;
             std::size_t end_j = Triangular::is_upper? Mb : i+1;
             for(std::size_t j = start_j; j < end_j; ++j){//traverse over the blocks that are to be computed
                 std::size_t mc2 = std::min(MC, M - j * MC);
                 //load block of the right E into memory
                 matrix_range<typename const_expression<E>::type> E_rights(e(), j * MC, j * MC + mc2, k*EC, k*EC + kc );
                 matrix_transpose<matrix_range<typename const_expression<E>::type> > E_rights_trans(E_rights);
                 pack_B_dense(E_rights_trans, E_right, block_size());
 
                 if(i==j){//diagonal block: we have to ensure that we only access elements on the diagonal
                     for(std::size_t i0 = 0; i0 != mc; ++i0){
                         for(std::size_t j0 = 0; j0 != mc2; ++j0){
                             M_diagonal_block[i0*MC+j0] = 0.0;
                         }
                     }
                     mgemm(
                         mc, mc2, kc, alpha, E_left, E_right,
                         M_diagonal_block, MC, 1, block_size()
                     );
                     auto M_diagonal = Mpointer + i * MC * stride1 + j * MC * stride2;
                     for(std::size_t i0 = 0; i0 != mc; ++i0){
                         std::size_t start_j0 = Triangular::is_upper? i0 : 0;
                         std::size_t end_j0 = Triangular::is_upper? mc2 : i0+1;
                         for(std::size_t j0 = start_j0; j0 < end_j0; ++j0){
                             M_diagonal[i0*stride1+j0*stride2] += M_diagonal_block[i0*MC+j0];
                         }
                     }
                 }else{
                     mgemm(
                         mc, mc2, kc, alpha, E_left, E_right,
                         &Mpointer[i*MC * stride1 + j*MC * stride2], stride1, stride2, block_size()
                     );
                 }
             }
         }
     }
     //free storage
     allocatorE.deallocate(E_left,MC * EC);
     allocatorE.deallocate(E_right,MC * EC);
     allocatorM.deallocate(M_diagonal_block, MC * MC);
 }
 
 
 
 //main kernel runs the kernel above recursively and calls gemv
 template <bool Upper, typename M, typename E>
 void syrk(
     matrix_expression<E, cpu_tag> const& e,
     matrix_expression<M, cpu_tag>& m,
     typename M::value_type& alpha,
     std::false_type //unoptimized
 ){
     REMORA_SIZE_CHECK(m().size1() == m().size2());
     REMORA_SIZE_CHECK(m().size2() == e().size1());
 
     syrk_impl(e,m, alpha, triangular_tag<Upper,false>());
 }
 
 }}
 
 #endif