31 #ifndef REMORA_KERNELS_DEFAULT_SYRK_HPP 32 #define REMORA_KERNELS_DEFAULT_SYRK_HPP 34 #include "../../expression_types.hpp" 35 #include "../../detail/matrix_proxy_classes.hpp" 37 #include <type_traits> 39 namespace remora {
namespace bindings {
43 struct syrk_block_size {
44 typedef detail::block<T> block;
45 static const unsigned mr = 4;
46 static const unsigned nr = mr * block::max_vector_elements;
47 static const unsigned lhs_block_size = 3 * mr * nr;
48 static const unsigned rhs_k_size = 1024;
50 template <
class E,
class Mat,
class Triangular>
52 matrix_expression<E, cpu_tag>
const& e,
53 matrix_expression<Mat, cpu_tag>& m,
54 typename Mat::value_type& alpha,
57 typedef typename E::value_type value_type;
58 typedef syrk_block_size<value_type> block_size;
60 static const std::size_t MC = block_size::lhs_block_size;
61 static const std::size_t EC = block_size::rhs_k_size;
64 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocatorE;
65 boost::alignment::aligned_allocator<typename Mat::value_type,block_size::block::align> allocatorM;
66 value_type* E_left = allocatorE.allocate(MC * EC);
67 value_type* E_right = allocatorE.allocate(MC * EC);
68 auto M_diagonal_block = allocatorM.allocate(MC * MC);
71 const std::size_t M = e().size1();
72 const std::size_t K = e().size2();
73 const std::size_t Mb = (M+MC-1) / MC;
74 const std::size_t Eb = (K+EC-1) / EC;
77 auto storageM = m().raw_storage();
78 auto Mpointer = storageM.values;
79 const std::size_t stride1 = Mat::orientation::index_M(storageM.leading_dimension,1);
80 const std::size_t stride2 = Mat::orientation::index_m(storageM.leading_dimension,1);
82 for (std::size_t k = 0; k < Eb; ++k) {
83 std::size_t kc = std::min(EC, K - k * EC);
84 for (std::size_t i = 0; i < Mb; ++i){
85 std::size_t mc = std::min(MC, M - i * MC);
87 matrix_range<E const> E_lefts(e(), i * MC, i * MC + mc, k*EC, k*EC + kc );
88 pack_A_dense(E_lefts, E_left, block_size());
90 std::size_t start_j = Triangular::is_upper? i : 0;
91 std::size_t end_j = Triangular::is_upper? Mb : i+1;
92 for(std::size_t j = start_j; j < end_j; ++j){
93 std::size_t mc2 = std::min(MC, M - j * MC);
95 matrix_range<typename const_expression<E>::type> E_rights(e(), j * MC, j * MC + mc2, k*EC, k*EC + kc );
96 matrix_transpose<matrix_range<typename const_expression<E>::type> > E_rights_trans(E_rights);
97 pack_B_dense(E_rights_trans, E_right, block_size());
100 for(std::size_t i0 = 0; i0 != mc; ++i0){
101 for(std::size_t j0 = 0; j0 != mc2; ++j0){
102 M_diagonal_block[i0*MC+j0] = 0.0;
106 mc, mc2, kc, alpha, E_left, E_right,
107 M_diagonal_block, MC, 1, block_size()
109 auto M_diagonal = Mpointer + i * MC * stride1 + j * MC * stride2;
110 for(std::size_t i0 = 0; i0 != mc; ++i0){
111 std::size_t start_j0 = Triangular::is_upper? i0 : 0;
112 std::size_t end_j0 = Triangular::is_upper? mc2 : i0+1;
113 for(std::size_t j0 = start_j0; j0 < end_j0; ++j0){
114 M_diagonal[i0*stride1+j0*stride2] += M_diagonal_block[i0*MC+j0];
119 mc, mc2, kc, alpha, E_left, E_right,
120 &Mpointer[i*MC * stride1 + j*MC * stride2], stride1, stride2, block_size()
127 allocatorE.deallocate(E_left,MC * EC);
128 allocatorE.deallocate(E_right,MC * EC);
129 allocatorM.deallocate(M_diagonal_block, MC * MC);
135 template <
bool Upper,
typename M,
typename E>
137 matrix_expression<E, cpu_tag>
const& e,
138 matrix_expression<M, cpu_tag>& m,
139 typename M::value_type& alpha,
142 REMORA_SIZE_CHECK(m().size1() == m().size2());
143 REMORA_SIZE_CHECK(m().size2() == e().size1());
145 syrk_impl(e,m, alpha, triangular_tag<Upper,false>());