#ifndef REMORA_KERNELS_DEFAULT_MGEMM_HPP
#define REMORA_KERNELS_DEFAULT_MGEMM_HPP

#include <boost/align/assume_aligned.hpp>
#include <type_traits>
#include <algorithm>
#include <cstddef>

namespace remora{
namespace bindings {
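
// Micro kernel: computes one mr x nr block of C += alpha * A * B, where A and B
// point into packed panels of depth kc as produced by pack_A_dense/pack_B_dense below.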
template <class block_size, class T, class TC>
void ugemm(
	std::size_t kc, TC alpha, T const* A, T const* B,
	TC* C, std::size_t stride1, std::size_t stride2
){
	BOOST_ALIGN_ASSUME_ALIGNED(A, block_size::block::align);
	BOOST_ALIGN_ASSUME_ALIGNED(B, block_size::block::align);

	typedef typename block_size::block::type vx;
	static const std::size_t vector_length = block_size::block::vector_elements;
	static const std::size_t vecNR = block_size::nr/vector_length;
#ifdef REMORA_USE_SIMD
	vx P[block_size::mr * vecNR] = {};
#else
	// without SIMD support, place the accumulator in aligned storage and zero it by hand
	typename std::aligned_storage<sizeof(vx[block_size::mr*vecNR]),block_size::block::align>::type Pa;
	T* P = reinterpret_cast<T*>(&Pa);
	for (std::size_t c = 0; c < block_size::mr*vecNR; c++)
		P[c] = 0;
#endif
	// accumulate the rank-kc update into the register block P
	vx const* b = (vx const*)B;
	for (std::size_t l=0; l<kc; ++l) {
		for (std::size_t i=0; i<block_size::mr; ++i) {
			for (std::size_t j=0; j<vecNR; ++j) {
				P[i * vecNR+j] += A[i]*b[j];
			}
		}
		A += block_size::mr;
		b += vecNR;
	}
	// scale the accumulated block by alpha
	for (std::size_t i=0; i<block_size::mr; ++i) {
		for (std::size_t j=0; j< vecNR; ++j) {
			P[i*vecNR+j] *= alpha;
		}
	}
	// add the result to the target block of C
	T const* p = (T const*) P;
	for (std::size_t i=0; i<block_size::mr; ++i) {
		for (std::size_t j=0; j<block_size::nr; ++j) {
			C[i * stride1+j * stride2] += p[i*block_size::nr+j];
		}
	}
}
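
// Macro kernel: multiplies the packed panels of A (mc x kc) and B (kc x nc) block
// by block and accumulates into C. Boundary blocks smaller than mr x nr are
// computed into a temporary buffer and only the valid part is copied back.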
template <class T, class TC, class block_size>
void mgemm(
	std::size_t mc, std::size_t nc, std::size_t kc, TC alpha,
	T const* A, T const* B, TC *C,
	std::size_t stride1, std::size_t stride2, block_size
){
	static std::size_t const MR = block_size::mr;
	static std::size_t const NR = block_size::nr;
	std::size_t const mp = (mc+MR-1) / MR;
	std::size_t const np = (nc+NR-1) / NR;
	for (std::size_t j=0; j<np; ++j) {
		std::size_t const nr = std::min(NR, nc - j*NR);
		for (std::size_t i=0; i<mp; ++i) {
			std::size_t const mr = std::min(MR, mc - i*MR);
			auto CBlockStart = C+i*MR*stride1+j*NR*stride2;
			if (mr==MR && nr==NR) {
				// full block: accumulate directly into C
				ugemm<block_size>(
					kc, alpha,
					&A[i*kc*MR], &B[j*kc*NR],
					CBlockStart, stride1, stride2
				);
			} else {
				// boundary block: compute into a temporary and copy the valid part
				TC CTempBlock[MR*NR];
				std::fill_n(CTempBlock, MR*NR, T(0));
				ugemm<block_size>(
					kc, alpha,
					&A[i*kc*MR], &B[j*kc*NR],
					CTempBlock, NR, 1
				);
				for (std::size_t i0=0; i0<mr; ++i0){
					for (std::size_t j0=0; j0<nr; ++j0) {
						CBlockStart[i0*stride1+j0 * stride2] += CTempBlock[i0*NR+j0];
					}
				}
			}
		}
	}
}
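
// Packs the mc x kc block of A into row panels of mr rows each, zero-padding the
// last panel so the micro kernel always sees full mr x nr blocks.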
template <class E, class T, class block_size>
void pack_A_dense(matrix_expression<E, cpu_tag> const& A, T* p, block_size)
{
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const mc = A().size1();
	std::size_t const kc = A().size2();
	static std::size_t const MR = block_size::mr;
	const std::size_t mp = (mc+MR-1) / MR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<mp; ++l) {
		for (std::size_t j=0; j<kc; ++j) {
			// copy one mr x kc panel, padding rows beyond mc with zeros
			for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
				p[nu] = (i<mc) ? A()(i,j) : T(0);
			}
		}
	}
}
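
// Packs the kc x nc block of B into column panels of nr columns each,
// zero-padding the last panel.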
template <class E, class T, class block_size>
void pack_B_dense(matrix_expression<E, cpu_tag> const& B, T* p, block_size)
{
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const kc = B().size1();
	std::size_t const nc = B().size2();
	static std::size_t const NR = block_size::nr;
	std::size_t const np = (nc+NR-1) / NR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<np; ++l) {
		for (std::size_t i=0; i<kc; ++i) {
			// copy one kc x nr panel, padding columns beyond nc with zeros
			for (std::size_t j = l*NR; j < l*NR + NR; ++j,++nu){
				p[nu] = (j<nc) ? B()(i,j) : T(0);
			}
		}
	}
}

}}
#endif