#ifndef REMORA_KERNELS_CLBLAS_TRMV_HPP
#define REMORA_KERNELS_CLBLAS_TRMV_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <boost/compute/kernel.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/functional/operator.hpp>
#include "../gemv.hpp"

namespace remora {
namespace bindings {
//compiled kernel for a single triangular block together with the indices of its arguments
struct trmv_kernel{
	boost::compute::kernel kernel;
	std::size_t start_index;
	std::size_t end_index;
	std::size_t unit_index;
	std::size_t upper_index;
};
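
//createTRMVBlockKernel generates an OpenCL kernel computing v <- A*v for a single
//TILE_SIZE x TILE_SIZE diagonal block of the triangular matrix A. The block of A and the
//matching part of v are staged in local memory, each work item handles a subset of the
//rows, and the results are written back to v once all work items have finished.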
template<class MatA, class VecV>
trmv_kernel createTRMVBlockKernel(
	matrix_expression<MatA, gpu_tag> const& A,
	vector_expression<VecV, gpu_tag>& v,
	char const* options
){
	typedef typename MatA::value_type value_typeA;
	typedef typename VecV::value_type value_typeV;
	boost::compute::multiplies<value_typeV> prod;

	boost::compute::detail::meta_kernel k("blas_trmv");
	std::size_t start_index = k.add_arg<std::size_t>("start");//first row/column of the block
	std::size_t end_index = k.add_arg<std::size_t>("end");//one past the last row/column of the block
	std::size_t unit_index = k.add_arg<std::size_t>("unit");//whether A is unit triangular
	std::size_t upper_index = k.add_arg<std::size_t>("upper");//whether A is upper triangular

	//local memory for the block of A (padded to avoid bank conflicts) and for the
	//input and result values of v
	k << "__local " << k.decl<value_typeA>("Asub") << "[TILE_SIZE][TILE_SIZE+2];\n";
	k << "__local " << k.decl<value_typeV>("Bsub") << "[TILE_SIZE];\n";
	k << "__local " << k.decl<value_typeV>("BResult") << "[TILE_SIZE];\n";
	k << "	const ulong numWorkers = get_local_size(0);\n";

	//load the current block of A into local memory
	k << "const ulong curTileA = end-start;\n";
	k << "for(ulong i = 0; i < curTileA; ++i){\n";
	k << "	for(ulong j = get_local_id(0); j < curTileA; j += numWorkers){\n";
	k << "		Asub[i][j] = " << A()(k.expr<cl_ulong>("(i+start)"), k.expr<cl_ulong>("(j+start)")) << ";\n";
	k << "	}\n";
	k << "}\n";

	//load the corresponding part of v into local memory
	k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << "	Bsub[i] = " << v()(k.expr<cl_ulong>("(start+i)")) << ";\n";
	k << "}\n";
	//wait until all loads are finished
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";

	//compute the product for the current block.
	//lower triangular case: row i depends only on elements j <= i
	k << "if(!upper){\n";
	k << "	for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << "		BResult[i] = Bsub[i];\n";
	k << "		if(!unit){BResult[i] *= Asub[i][i];}\n";
	k << "		for(ulong j = 0; j < i; ++j){\n";
	k << "			BResult[i] += " << prod(k.expr<value_typeV>("Bsub[j]"), k.expr<value_typeA>("Asub[i][j]")) << ";\n";
	k << "		}\n";
	k << "	}\n";
	//upper triangular case: row i depends only on elements j >= i
	k << "}else{\n";
	k << "	for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << "		BResult[i] = Bsub[i];\n";
	k << "		if(!unit){BResult[i] *= Asub[i][i];}\n";
	k << "		for(ulong j = i+1; j < curTileA; ++j){\n";
	k << "			BResult[i] += " << prod(k.expr<value_typeV>("Bsub[j]"), k.expr<value_typeA>("Asub[i][j]")) << ";\n";
	k << "		}\n";
	k << "	}\n";
	k << "}\n";
	//make sure all results are computed before overwriting v
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";

	//write the result back to v
	k << "for(ulong i = 0; i < curTileA; ++i){\n";
	k << v()(k.expr<cl_ulong>("(start+i)")) << " = BResult[i];\n";
	k << "}\n";

	boost::compute::kernel kernel = k.compile(v().queue().get_context(), options);
	return {kernel, start_index, end_index, unit_index, upper_index};
}
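
//trmv_recursive computes v <- A*v on the sub-range [start,end). Ranges that fit into a
//single tile are handled by the block kernel above; larger ranges are split in half, the
//two diagonal blocks are processed recursively and the off-diagonal block is applied via
//a gemv update (vfront += Aur*vback for upper triangular, vback += All*vfront for lower).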
template <typename MatA, typename VecV, typename Triangular>
void trmv_recursive(
	matrix_expression<MatA, gpu_tag> const& Afull,
	vector_expression<VecV, gpu_tag>& vfull,
	trmv_kernel& kernel,
	std::size_t start,
	std::size_t end,
	std::size_t tileSizeA,
	Triangular t
){
	std::size_t size = end-start;

	//if the range fits into a single tile, run the block kernel directly
	if(size <= tileSizeA){
		kernel.kernel.set_arg(kernel.start_index, start);
		kernel.kernel.set_arg(kernel.end_index, end);
		kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
		kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);

		std::size_t global_work_size[2] = {
			(size+tileSizeA-1) / tileSizeA * tileSizeA,
			1
		};
		std::size_t local_work_size[2] = {tileSizeA, 1};
		vfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2, nullptr, global_work_size, local_work_size);
	}else{
		//otherwise split at a multiple of the tile size and recurse on the two halves
		std::size_t split = ((size+tileSizeA-1)/tileSizeA)/2 * tileSizeA;
		vector_range<VecV> vfront(vfull(),start,start+split);
		vector_range<VecV> vback(vfull(),start+split,end);

		if(Triangular::is_upper){
			//gemv runs before vback is overwritten by the second recursive call
			matrix_range<typename const_expression<MatA>::type> Aur(Afull(),start,start+split,start+split,end);
			trmv_recursive(Afull, vfull, kernel, start, start+split, tileSizeA, t);
			kernels::gemv(Aur, vback, vfront, 1.0);
			trmv_recursive(Afull, vfull, kernel, start+split, end, tileSizeA, t);
		}else{
			//gemv runs before vfront is overwritten by the second recursive call
			matrix_range<typename const_expression<MatA>::type> All(Afull(),start+split,end,start,start+split);
			trmv_recursive(Afull, vfull, kernel, start+split, end, tileSizeA, t);
			kernels::gemv(All, vfront, vback, 1.0);
			trmv_recursive(Afull, vfull, kernel, start, start+split, tileSizeA, t);
		}
	}
}

}//namespace bindings

namespace kernels{
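
//Example use of the kernel below (a sketch; the gpu container types named here are
//assumptions, the actual types come from the rest of the library):
//	matrix<float, row_major, gpu_tag> A = ...; //square triangular matrix
//	vector<float, gpu_tag> v = ...;            //overwritten with the product
//	kernels::trmv<false, false>(A, v);         //Upper=false, Unit=false: v = lower(A)*v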

//computes v <- A*v with A triangular, using the recursive blocked algorithm above
template <bool Upper, bool Unit, typename MatA, typename VecV>
void trmv(
	matrix_expression<MatA, gpu_tag> const& A,
	vector_expression<VecV, gpu_tag>& v
){
	REMORA_SIZE_CHECK(A().size1() == A().size2());
	REMORA_SIZE_CHECK(A().size2() == v().size());

	std::size_t const TileSizeA = 32;//size of the diagonal blocks handled by a single work-group
	char const* options = "-DTILE_SIZE=32ul";
	auto kernel = bindings::createTRMVBlockKernel(A, v, options);

	bindings::trmv_recursive(A, v, kernel, 0, A().size1(), TileSizeA, triangular_tag<Upper,Unit>());
}

}}
#endif