31 #ifndef REMORA_KERNELS_DEFAULT_TRMM_HPP 32 #define REMORA_KERNELS_DEFAULT_TRMM_HPP 34 #include "../../expression_types.hpp" 35 #include "../../detail/traits.hpp" 36 #include "../../detail/matrix_proxy_classes.hpp" 39 #include <type_traits> 40 namespace remora{
namespace bindings {
43 struct trmm_block_size {
44 typedef detail::block<T> block;
45 static const unsigned mr = 4;
46 static const unsigned nr = 3 * block::max_vector_elements;
47 static const unsigned lhs_block_size = 32 * mr;
48 static const unsigned rhs_column_size = (1024 / nr) * nr;
49 static const unsigned align = 64;
52 template <
class E,
class T,
class block_size,
bool unit>
53 void pack_A_triangular(matrix_expression<E, cpu_tag>
const& A, T* p, block_size, triangular_tag<false,unit>)
55 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
57 std::size_t
const mc = A().size1();
58 std::size_t
const kc = A().size2();
59 static std::size_t
const MR = block_size::mr;
60 const std::size_t mp = (mc+MR-1) / MR;
63 for (std::size_t l=0; l<mp; ++l) {
64 for (std::size_t j=0; j<kc; ++j) {
65 for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
69 p[nu] = ((i<mc) && (i >= j)) ? A()(i,j) : T(0);
75 template <
class E,
class T,
class block_size,
bool unit>
76 void pack_A_triangular(matrix_expression<E, cpu_tag>
const& A, T* p, block_size, triangular_tag<true,unit>)
78 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
80 std::size_t
const mc = A().size1();
81 std::size_t
const kc = A().size2();
82 static std::size_t
const MR = block_size::mr;
83 const std::size_t mp = (mc+MR-1) / MR;
86 for (std::size_t l=0; l<mp; ++l) {
87 for (std::size_t j=0; j<kc; ++j) {
88 for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
92 p[nu] = ((i<mc) && (i <= j)) ? A()(i,j) : T(0);
99 template <
class E1,
class E2,
class Triangular>
101 matrix_expression<E1, cpu_tag>
const& e1,
102 matrix_expression<E2, cpu_tag> & e2,
105 typedef typename std::common_type<typename E1::value_type, typename E2::value_type>::type value_type;
106 typedef trmm_block_size<value_type> block_size;
108 static const std::size_t AC = block_size::lhs_block_size;
109 static const std::size_t BC = block_size::rhs_column_size;
112 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
113 value_type* A = allocator.allocate(AC * AC);
114 value_type* B = allocator.allocate(AC * BC);
117 const std::size_t M = e1().size1();
118 const std::size_t N = e2().size2();
119 const std::size_t Ab = (M+AC-1) / AC;
120 const std::size_t Bb = (N+BC-1) / BC;
123 auto storageB = e2().raw_storage();
124 auto Bpointer = storageB.values;
125 const std::size_t stride1 = E2::orientation::index_M(storageB.leading_dimension,1);
126 const std::size_t stride2 = E2::orientation::index_m(storageB.leading_dimension,1);
128 for (std::size_t j = 0; j < Bb; ++j) {
129 std::size_t nc = std::min(BC, N - j * BC);
134 for (std::size_t k0 = 0; k0< Ab; ++k0){
135 std::size_t k = Triangular::is_upper? k0: Ab-k0-1;
136 std::size_t kc = std::min(AC, M - k * AC);
138 matrix_range<E2> Bs(e2(), k*AC, k*AC + kc, j * BC, j * BC + nc );
139 pack_B_dense(Bs, B, block_size());
144 std::size_t i_start = Triangular::is_upper? 0: k;
145 std::size_t i_end = Triangular::is_upper? k+1: Ab;
146 for (std::size_t i = i_start; i < i_end; ++i) {
147 std::size_t mc = std::min(AC, M - i * AC);
148 matrix_range<typename const_expression<E1>::type> As(e1(), i * AC, i * AC + mc, k * AC, k * AC + kc);
150 pack_A_triangular(As, A, block_size(), t);
152 pack_A_dense(As, A, block_size());
155 mc, nc, kc, value_type(1.0), A, B,
156 &Bpointer[i*AC * stride1 + j*BC * stride2], stride1, stride2, block_size()
162 allocator.deallocate(A,AC * AC);
163 allocator.deallocate(B,AC * BC);
169 template <
bool Upper,
bool Unit,
typename MatA,
typename MatB>
171 matrix_expression<MatA, cpu_tag>
const& A,
172 matrix_expression<MatB, cpu_tag>& B,
175 REMORA_SIZE_CHECK(A().size1() == A().size2());
176 REMORA_SIZE_CHECK(A().size2() == B().size1());
178 trmm_impl(A,B, triangular_tag<Upper,Unit>());