#ifndef REMORA_KERNELS_DEFAULT_MGEMM_HPP
#define REMORA_KERNELS_DEFAULT_MGEMM_HPP

#include <boost/align/assume_aligned.hpp>
#include <type_traits>
#include <algorithm>
#include <cstddef>

namespace remora{
namespace bindings {
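
// Micro kernel: computes one mr x nr block of C += alpha * A * B, where A and B
// point into packed panels of depth kc as produced by pack_A_dense/pack_B_dense below.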
template <class block_size, class T, class TC>
void ugemm(
	std::size_t kc, TC alpha, T const* A, T const* B,
	TC* C, std::size_t stride1, std::size_t stride2
){
	BOOST_ALIGN_ASSUME_ALIGNED(A, block_size::block::align);
	BOOST_ALIGN_ASSUME_ALIGNED(B, block_size::block::align);

	typedef typename block_size::block::type vx;
	static const std::size_t vector_length = block_size::block::vector_elements;
	static const std::size_t vecNR = block_size::nr/vector_length;
#ifdef REMORA_USE_SIMD
	vx P[block_size::mr * vecNR] = {};
#else
	// without SIMD support, place the accumulator in aligned storage and zero it by hand
	typename std::aligned_storage<sizeof(vx[block_size::mr*vecNR]),block_size::block::align>::type Pa;
	T* P = reinterpret_cast<T*>(&Pa);
	for (std::size_t c = 0; c < block_size::mr*vecNR; c++)
		P[c] = 0;
#endif
	// accumulate the rank-kc update into the register block P
	vx const* b = (vx const*)B;
	for (std::size_t l=0; l<kc; ++l) {
		for (std::size_t i=0; i<block_size::mr; ++i) {
			for (std::size_t j=0; j<vecNR; ++j) {
				P[i * vecNR+j] += A[i]*b[j];
			}
		}
		A += block_size::mr;
		b += vecNR;
	}
	// scale the accumulated block by alpha
	for (std::size_t i=0; i<block_size::mr; ++i) {
		for (std::size_t j=0; j< vecNR; ++j) {
			P[i*vecNR+j] *= alpha;
		}
	}
	// add the result to the target block of C
	T const* p = (T const*) P;
	for (std::size_t i=0; i<block_size::mr; ++i) {
		for (std::size_t j=0; j<block_size::nr; ++j) {
			C[i * stride1+j * stride2] += p[i*block_size::nr+j];
		}
	}
}
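
// Macro kernel: multiplies the packed panels of A (mc x kc) and B (kc x nc) block
// by block and accumulates into C. Boundary blocks smaller than mr x nr are
// computed into a temporary buffer and only the valid part is copied back.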
template <class T, class TC, class block_size>
void mgemm(
	std::size_t mc, std::size_t nc, std::size_t kc, TC alpha,
	T const* A, T const* B, TC *C,
	std::size_t stride1, std::size_t stride2, block_size
){
	static std::size_t const MR = block_size::mr;
	static std::size_t const NR = block_size::nr;
	std::size_t const mp = (mc+MR-1) / MR;
	std::size_t const np = (nc+NR-1) / NR;
	for (std::size_t j=0; j<np; ++j) {
		std::size_t const nr = std::min(NR, nc - j*NR);
		for (std::size_t i=0; i<mp; ++i) {
			std::size_t const mr = std::min(MR, mc - i*MR);
			auto CBlockStart = C+i*MR*stride1+j*NR*stride2;
			if (mr==MR && nr==NR) {
				// full block: accumulate directly into C
				ugemm<block_size>(
					kc, alpha,
					&A[i*kc*MR], &B[j*kc*NR],
					CBlockStart, stride1, stride2
				);
			} else {
				// boundary block: compute into a temporary and copy the valid part
				TC CTempBlock[MR*NR];
				std::fill_n(CTempBlock, MR*NR, T(0));
				ugemm<block_size>(
					kc, alpha,
					&A[i*kc*MR], &B[j*kc*NR],
					CTempBlock, NR, 1
				);
				for (std::size_t i0=0; i0<mr; ++i0){
					for (std::size_t j0=0; j0<nr; ++j0) {
						CBlockStart[i0*stride1+j0 * stride2] += CTempBlock[i0*NR+j0];
					}
				}
			}
		}
	}
}
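
// Packs the mc x kc block of A into row panels of mr rows each, zero-padding the
// last panel so the micro kernel always sees full mr x nr blocks.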
template <class E, class T, class block_size>
void pack_A_dense(matrix_expression<E, cpu_tag> const& A, T* p, block_size)
{
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const mc = A().size1();
	std::size_t const kc = A().size2();
	static std::size_t const MR = block_size::mr;
	const std::size_t mp = (mc+MR-1) / MR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<mp; ++l) {
		for (std::size_t j=0; j<kc; ++j) {
			// copy one mr x kc panel, padding rows beyond mc with zeros
			for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
				p[nu] = (i<mc) ? A()(i,j) : T(0);
			}
		}
	}
}
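
// Packs the kc x nc block of B into column panels of nr columns each,
// zero-padding the last panel.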
template <class E, class T, class block_size>
void pack_B_dense(matrix_expression<E, cpu_tag> const& B, T* p, block_size)
{
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const kc = B().size1();
	std::size_t const nc = B().size2();
	static std::size_t const NR = block_size::nr;
	std::size_t const np = (nc+NR-1) / NR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<np; ++l) {
		for (std::size_t i=0; i<kc; ++i) {
			// copy one kc x nr panel, padding columns beyond nc with zeros
			for (std::size_t j = l*NR; j < l*NR + NR; ++j,++nu){
				p[nu] = (j<nc) ? B()(i,j) : T(0);
			}
		}
	}
}

}}
#endif