31 #ifndef REMORA_KERNELS_CLBLAS_TRSV_HPP 32 #define REMORA_KERNELS_CLBLAS_TRSV_HPP 35 #include "../../detail/traits.hpp" 36 #include "../../expression_types.hpp" 37 #include "../../detail/matrix_proxy_classes.hpp" 38 #include "../gemv.hpp" 39 #include <boost/compute/kernel.hpp> 40 #include <boost/compute/detail/meta_kernel.hpp> 41 #include <boost/compute/functional/operator.hpp> 45 namespace remora{
namespace bindings {
47 boost::compute::kernel kernel;
48 std::size_t start_index;
49 std::size_t end_index;
50 std::size_t unit_index;
51 std::size_t upper_index;
54 template<
class MatA,
class VecB>
55 trsv_kernel createTRSVDiagBlockKernel(
56 matrix_expression<MatA, gpu_tag>
const& A,
57 vector_expression<VecB, gpu_tag> &b,
60 typedef typename MatA::value_type value_typeA;
61 typedef typename VecB::value_type value_typeB;
62 boost::compute::multiplies<value_typeB> prod;
64 boost::compute::detail::meta_kernel k(
"blas_trsv");
65 std::size_t start_index = k.add_arg<std::size_t>(
"start");
66 std::size_t end_index = k.add_arg<std::size_t>(
"end");
67 std::size_t unit_index = k.add_arg<std::size_t>(
"unit");
68 std::size_t upper_index = k.add_arg<std::size_t>(
"upper");
70 k <<
"__local " <<k.decl<value_typeA>(
"Asub")<<
"[TILE_SIZE][TILE_SIZE+2];\n";
71 k <<
"__local " <<k.decl<value_typeB>(
"Bsub")<<
"[TILE_SIZE];\n";
72 k <<
"const ulong numWorkers = get_local_size(0);\n";
74 k <<
"const ulong curTileA = end-start;\n";
77 k <<
"for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
78 k <<
" for(ulong j = 0; j < TILE_SIZE; j++){\n";
79 k <<
" Asub[i][j] ="<< A()(k.expr<cl_ulong>(
"min(end-1, start + i)"),k.expr<cl_ulong>(
"min(end-1, start + j)"))<<
";\n";
85 k <<
"for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
86 k <<
" Bsub[i] ="<< b()(k.expr<cl_ulong>(
"min(end-1,start + i)"))<<
";\n";
89 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
94 k <<
" for(ulong i = 0; i < TILE_SIZE && get_local_id(0) == 0; ++i){\n";
95 k <<
" if(!unit){Bsub[i] /= Asub[i][i];}\n";
96 k <<
" for(ulong j = i+1; j < TILE_SIZE; ++j){\n";
97 k <<
" Bsub[j] -= "<< prod(k.expr<value_typeB>(
"Bsub[i]"), k.expr<value_typeA>(
"Asub[j][i]"))<<
";\n";
102 k <<
" for(ulong n = curTileA; n > 0 && get_local_id(0) == 0; --n){\n";
103 k <<
" ulong i = n-1;\n";
104 k <<
" if(!unit ){Bsub[i] /= Asub[i][i];}\n";
105 k <<
" for(ulong j = 0; j < i; j ++){\n";
106 k <<
" Bsub[j] -= "<< prod(k.expr<value_typeB>(
"Bsub[i]"), k.expr<value_typeA>(
"Asub[j][i]"))<<
";\n";
111 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
113 k <<
"for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
114 k << b()(k.expr<cl_ulong>(
"(start+i)"))<<
" = Bsub[i];\n";
117 boost::compute::kernel kernel = k.compile(b().queue().get_context(), options);
118 return {kernel,start_index,end_index,unit_index,upper_index};
121 template <
typename MatA,
typename VecB,
class Triangular>
123 matrix_expression<MatA, gpu_tag>
const& Afull,
124 vector_expression<VecB, gpu_tag> & bfull,
128 std::size_t tileSize,
129 std::size_t numWorkers,
133 std::size_t size = end-start;
135 if(size <= tileSize){
137 kernel.kernel.set_arg(kernel.start_index, start);
138 kernel.kernel.set_arg(kernel.end_index, end);
139 kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
140 kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);
142 std::size_t global_work_size[2] = {numWorkers,1};
143 std::size_t local_work_size[2] = {numWorkers, 1};
144 bfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,
nullptr, global_work_size, local_work_size);
147 std::size_t numBlocks = (size+tileSize-1)/tileSize;
148 std::size_t split = numBlocks/2 * tileSize;
149 vector_range<VecB> bfront(bfull(),start,start+split);
150 vector_range<VecB> bback(bfull(),start+split,end);
153 if(Triangular::is_upper){
154 matrix_range<typename const_expression<MatA>::type > Aur(Afull(),start,start+split,start+split,end);
155 trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
156 kernels::gemv(Aur, bback, bfront, -1.0);
157 trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
159 matrix_range<typename const_expression<MatA>::type> All(Afull(),start+split,end,start,start+split);
160 trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
161 kernels::gemv(All, bfront, bback, -1.0);
162 trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
166 template <
typename MatA,
typename VecB,
class Triangular>
168 matrix_expression<MatA, gpu_tag>
const& A,
169 vector_expression<VecB, gpu_tag>& b,
173 std::size_t
const TileSize = 32;
174 std::size_t
const numWorkers = TileSize;
175 char const* options =
"-DTILE_SIZE=32ul";
176 auto kernel = bindings::createTRSVDiagBlockKernel(A,b,options);
177 trsv_recursive(A,b,kernel,0,A().size1(), TileSize, numWorkers, Triangular());
180 template <
typename MatA,
typename VecB,
class Triangular>
182 matrix_expression<MatA, gpu_tag>
const& A,
183 vector_expression<VecB, gpu_tag>& b,
187 matrix_transpose<typename const_expression<MatA>::type> transA(A());
188 trsv_call(transA,b,
typename Triangular::transposed_orientation(),left());
193 template <
class Triangular,
class S
ide,
typename MatA,
typename VecB>
195 matrix_expression<MatA, gpu_tag>
const& A,
196 vector_expression<VecB, gpu_tag>& b
198 REMORA_SIZE_CHECK(A().size1() == A().size2());
199 REMORA_SIZE_CHECK(A().size2() == b().size());
200 bindings::trsv_call(A,b,Triangular(), Side());