copy.hpp
Go to the documentation of this file.
1 /*!
2  * \brief Implements operations to copy data from cpu to gpu and back
3  *
4  * \author O. Krause
5  * \date 2016
6  *
7  *
8  * \par Copyright 1995-2015 Shark Development Team
9  *
10  * <BR><HR>
11  * This file is part of Shark.
12  * <http://image.diku.dk/shark/>
13  *
14  * Shark is free software: you can redistribute it and/or modify
15  * it under the terms of the GNU Lesser General Public License as published
16  * by the Free Software Foundation, either version 3 of the License, or
17  * (at your option) any later version.
18  *
19  * Shark is distributed in the hope that it will be useful,
20  * but WITHOUT ANY WARRANTY; without even the implied warranty of
21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22  * GNU Lesser General Public License for more details.
23  *
24  * You should have received a copy of the GNU Lesser General Public License
25  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26  *
27  */
28 #ifndef REMORA_GPU_COPY_HPP
29 #define REMORA_GPU_COPY_HPP
30 
31 #include "../detail/traits.hpp"
32 //includes required for storage->vector/matrix and for scalar multiplication
33 #include "../detail/vector_proxy_classes.hpp"
34 #include "../detail/vector_expression_classes.hpp"
35 #include "../detail/matrix_proxy_classes.hpp"
36 #include "../detail/matrix_expression_classes.hpp"
37 
38 namespace remora{
39 
40 ///////////////////////////////////////
41 //////// Vector Transport
42 ///////////////////////////////////////
43 
/// \brief Expression template that transports a gpu vector expression into cpu memory on assignment.
///
/// The expression is blockwise: it cannot be evaluated element-by-element. Instead it
/// assigns itself to a cpu target via assign_to/plus_assign_to/minus_assign_to, optionally
/// scaled by a factor alpha.
template<class E>
class vector_transport_to_cpu: public vector_expression<vector_transport_to_cpu<E>, cpu_tag>{
public:
	typedef typename E::const_closure_type expression_closure_type;

	typedef typename E::value_type value_type;
	typedef typename E::size_type size_type;
	typedef value_type const& const_reference;
	typedef const_reference reference;

	typedef vector_transport_to_cpu const_closure_type;
	typedef vector_transport_to_cpu closure_type;
	typedef unknown_storage storage_type;
	typedef unknown_storage const_storage_type;
	typedef blockwise<typename E::evaluation_category::tag> evaluation_category;


	//FIXME: This is required even though iterators for block expressions are meaningless
	typedef typename E::const_iterator const_iterator;
	typedef const_iterator iterator;

	// Construction and destruction
	/// \brief Wraps the gpu expression to be transported; the queue is taken from the expression.
	explicit vector_transport_to_cpu(
		expression_closure_type const& expression
	):m_expression(expression){}

	/// \brief Number of elements of the wrapped expression.
	size_type size() const {
		return m_expression.size();
	}

	/// \brief The wrapped gpu expression.
	expression_closure_type const& expression() const {
		return m_expression;
	}
	/// \brief The command queue of the wrapped expression.
	boost::compute::command_queue& queue() const{
		return m_expression.queue();
	}

	//dispatcher to computation kernels
	// x <- alpha * e; dispatches on the storage tag of E (dense vs unknown)
	template<class VecX>
	void assign_to(vector_expression<VecX, cpu_tag>& x, value_type const& alpha = value_type(1) )const{
		assign_to(x, m_expression, alpha, typename E::storage_type::storage_tag());
	}
	// x += alpha * e
	template<class VecX>
	void plus_assign_to(vector_expression<VecX, cpu_tag>& x, value_type const& alpha = value_type(1) )const{
		plus_assign_to(x, m_expression, alpha, typename E::storage_type::storage_tag());
	}

	// x -= alpha * e, implemented by forwarding to plus_assign_to with negated alpha
	template<class VecX>
	void minus_assign_to(vector_expression<VecX, cpu_tag>& x, value_type alpha = value_type(1) )const{
		plus_assign_to(x,-alpha);
	}

private:
	//result is represented as dense storage on gpu, i.e. does not need to be calculated
	template<class VecX, class VecE>
	void assign_to(
		vector_expression<VecX, cpu_tag>& x, vector_expression<VecE, gpu_tag> const& e,
		value_type const& alpha, dense_tag
	)const{
		auto storageE = e().raw_storage();
		auto& buffer = storageE.buffer;
		//map buffer to host memory (read-only: we never modify the gpu data here)
		typename VecE::value_type* p = (typename VecE::value_type*) e().queue().enqueue_map_buffer(
			buffer, CL_MAP_READ, 0, buffer.size()
		);
		//adapt host memory buffer to vector and assign
		//note: storageE.offset counts elements and is therefore applied to the typed pointer
		typedef dense_vector_adaptor<typename VecE::value_type> AdaptE;
		AdaptE adaptE(p + storageE.offset,size(), storageE.stride);
		assign(x, vector_scalar_multiply<AdaptE >( adaptE, alpha));

		//unmap memory
		e().queue().enqueue_unmap_buffer(buffer,p);
	}

	template<class VecX, class VecE>
	void plus_assign_to(
		vector_expression<VecX, cpu_tag>& x, vector_expression<VecE, gpu_tag> const& e,
		value_type const& alpha, dense_tag
	)const{
		auto storageE = e().raw_storage();
		auto& buffer = storageE.buffer;
		//map buffer to host memory (read-only: only the cpu target x is modified)
		typename VecE::value_type* p = (typename VecE::value_type*) e().queue().enqueue_map_buffer(
			buffer, CL_MAP_READ, 0, buffer.size()
		);
		//adapt host memory buffer to vector and assign
		typedef dense_vector_adaptor<typename VecE::value_type> AdaptE;
		AdaptE adaptE(p + storageE.offset,size(), storageE.stride);

		plus_assign(x,vector_scalar_multiply<AdaptE >( adaptE, alpha));

		//unmap memory
		e().queue().enqueue_unmap_buffer(buffer,p);
	}

	//result has unknown storage, so evaluate into temporary on gpu and assign that to host
	template<class VecX, class VecE>
	void assign_to(
		vector_expression<VecX, cpu_tag>& x, vector_expression<VecE, gpu_tag> const& e,
		value_type const& alpha, unknown_tag
	)const{
		//evaluate expression on gpu and assign that to host
		typedef typename vector_temporary<E>::type result_type;
		result_type result = m_expression;

		//the temporary has dense storage, so this recurses into the dense overload above
		assign_to(x, result, alpha, typename result_type::storage_type::storage_tag());
	}

	template<class VecX, class VecE>
	void plus_assign_to(
		vector_expression<VecX, cpu_tag>& x, vector_expression<VecE, gpu_tag> const& e,
		value_type const& alpha, unknown_tag
	)const{
		//evaluate expression on gpu and assign that to host
		typedef typename vector_temporary<E>::type result_type;
		result_type result = m_expression;

		plus_assign_to(x, result, alpha, typename result_type::storage_type::storage_tag());
	}
	expression_closure_type m_expression;
};
165 
166 template<class E>
167 class vector_transport_to_gpu: public vector_expression<vector_transport_to_gpu<E>, gpu_tag>{
168 public:
169  typedef typename E::const_closure_type expression_closure_type;
170 
171  typedef typename E::value_type value_type;
172  typedef typename E::size_type size_type;
173  typedef value_type const& const_reference;
174  typedef const_reference reference;
175 
176  typedef vector_transport_to_gpu const_closure_type;
177  typedef vector_transport_to_gpu closure_type;
178  typedef unknown_storage storage_type;
179  typedef unknown_storage const_storage_type;
180  typedef blockwise<typename E::evaluation_category::tag> evaluation_category;
181 
182 
183  //FIXME: This is required even though iterators for block expressions are meaningless
184  typedef typename E::const_iterator const_iterator;
185  typedef const_iterator iterator;
186 
187  // Construction and destruction
188  explicit vector_transport_to_gpu(
189  expression_closure_type const& expression,
190  boost::compute::command_queue& queue
191  ):m_expression(expression), m_queue(&queue){}
192 
193  size_type size() const {
194  return m_expression.size();
195  }
196  expression_closure_type const& expression() const {
197  return m_expression;
198  }
199  boost::compute::command_queue& queue() const{
200  return *m_queue;
201  }
202 
203  //dispatcher to computation kernels
204  template<class VecX>
205  void assign_to(vector_expression<VecX, gpu_tag>& x, value_type const& alpha = value_type(1) )const{
206  assign_to(x, m_expression, alpha, typename E::storage_type::storage_tag());
207  }
208  template<class VecX>
209  void plus_assign_to(vector_expression<VecX, gpu_tag>& x, value_type const& alpha = value_type(1) )const{
210  plus_assign_to(x, m_expression, alpha, typename E::storage_type::storage_tag());
211  }
212 
213  template<class VecX>
214  void minus_assign_to(vector_expression<VecX, gpu_tag>& x, value_type alpha = value_type(1) )const{
215  plus_assign_to(x,-alpha);
216  }
217 
218 private:
219  //result is represented as dense storage on cpu, i.e. does not need to be calculated
220  template<class VecX, class VecE>
221  void assign_to(
222  vector_expression<VecX, gpu_tag>& x, vector_expression<VecE, cpu_tag> const& e,
223  value_type const& alpha, dense_tag
224  )const{
225  auto storagex = x().raw_storage();
226  auto& buffer = storagex.buffer;
227  //map buffer to host memory
228  typename VecX::value_type* p = (typename VecX::value_type*) x().queue().enqueue_map_buffer(
229  buffer, CL_MAP_WRITE, 0, buffer.size()
230  );
231  //adapt host memory buffer to vector and assign
232  dense_vector_adaptor<typename VecX::value_type> adaptX(p + storagex.offset,size(), storagex.stride);
233  assign(adaptX,vector_scalar_multiply<expression_closure_type>(m_expression,alpha));
234 
235  //unmap memory
236  x().queue().enqueue_unmap_buffer(buffer,p);
237  }
238 
239  template<class VecX, class VecE>
240  void plus_assign_to(
241  vector_expression<VecX, gpu_tag>& x, vector_expression<VecE, cpu_tag> const& e,
242  value_type const& alpha, dense_tag
243  )const{
244  auto storagex = x().raw_storage();
245  auto& buffer = storagex.buffer;
246  //map buffer to host memory
247  typename VecX::value_type* p = (typename VecX::value_type*) x().queue().enqueue_map_buffer(
248  buffer, CL_MAP_WRITE, storagex.offset, buffer.size() - storagex.offset
249  );
250  //adapt host memory buffer to vector and assign
251  dense_vector_adaptor<typename VecX::value_type> adaptX(p,size(), storagex.stride);
252  plus_assign(adaptX,vector_scalar_multiply<expression_closure_type>(m_expression,alpha));
253 
254  //unmap memory
255  x().queue().enqueue_unmap_buffer(buffer,p);
256  }
257 
258  expression_closure_type m_expression;
259  boost::compute::command_queue* m_queue;
260 };
261 
262 
263 ///////////////////////////////////////
264 //////// Matrix Transport
265 ///////////////////////////////////////
266 
/// \brief Expression template that transports a gpu matrix expression into cpu memory on assignment.
///
/// The expression is blockwise: it assigns itself to a cpu target via
/// assign_to/plus_assign_to/minus_assign_to, optionally scaled by a factor alpha.
template<class E>
class matrix_transport_to_cpu: public matrix_expression<matrix_transport_to_cpu<E>, cpu_tag>{
public:
	typedef typename E::const_closure_type expression_closure_type;

	typedef typename E::value_type value_type;
	typedef typename E::size_type size_type;
	typedef value_type const& const_reference;
	typedef const_reference reference;

	typedef matrix_transport_to_cpu const_closure_type;
	typedef matrix_transport_to_cpu closure_type;
	typedef unknown_storage storage_type;
	typedef unknown_storage const_storage_type;
	typedef blockwise<typename E::evaluation_category::tag> evaluation_category;
	typedef typename E::orientation orientation;

	//FIXME: This is required even though iterators for block expressions are meaningless
	typedef typename E::const_row_iterator const_row_iterator;
	typedef typename E::const_column_iterator const_column_iterator;
	typedef const_row_iterator row_iterator;
	typedef const_column_iterator column_iterator;

	// Construction and destruction
	/// \brief Wraps the gpu expression to be transported; the queue is taken from the expression.
	explicit matrix_transport_to_cpu(
		expression_closure_type const& expression
	):m_expression(expression){}

	/// \brief Number of rows of the wrapped expression.
	size_type size1() const {
		return m_expression.size1();
	}
	/// \brief Number of columns of the wrapped expression.
	size_type size2() const {
		return m_expression.size2();
	}
	/// \brief The wrapped gpu expression.
	expression_closure_type const& expression() const {
		return m_expression;
	}
	/// \brief The command queue of the wrapped expression.
	boost::compute::command_queue& queue() const{
		return m_expression.queue();
	}

	//dispatcher to computation kernels
	// X <- alpha * e; dispatches on the storage tag of E (dense vs unknown)
	template<class MatX>
	void assign_to(matrix_expression<MatX, cpu_tag>& X, value_type const& alpha = value_type(1) )const{
		assign_to(X, m_expression, alpha, typename E::storage_type::storage_tag());
	}
	// X += alpha * e
	template<class MatX>
	void plus_assign_to(matrix_expression<MatX, cpu_tag>& X, value_type const& alpha = value_type(1) )const{
		plus_assign_to(X, m_expression, alpha, typename E::storage_type::storage_tag());
	}

	// X -= alpha * e, implemented by forwarding to plus_assign_to with negated alpha
	template<class MatX>
	void minus_assign_to(matrix_expression<MatX, cpu_tag>& X, value_type alpha = value_type(1) )const{
		plus_assign_to(X,-alpha);
	}

private:
	//result is represented as dense storage on gpu, i.e. does not need to be calculated
	template<class MatX, class MatE>
	void assign_to(
		matrix_expression<MatX, cpu_tag>& X, matrix_expression<MatE, gpu_tag>const& e,
		value_type const& alpha, dense_tag
	)const{
		auto storageE = e().raw_storage();
		auto& buffer = storageE.buffer;
		//map buffer to host memory (read-only: the gpu data is not modified)
		typename MatE::value_type* p = (typename MatE::value_type*) e().queue().enqueue_map_buffer(
			buffer, CL_MAP_READ, 0, buffer.size()
		);
		//adapt host memory buffer to matrix and assign.
		//derive the two strides from the leading dimension depending on orientation:
		//index_M/index_m select which of (leading_dimension, 1) applies to rows vs columns
		typedef typename MatE::orientation EOrientation;
		std::size_t stride1 = EOrientation::index_M(storageE.leading_dimension,1);
		std::size_t stride2 = EOrientation::index_m(storageE.leading_dimension,1);
		typedef dense_matrix_adaptor<typename MatE::value_type, EOrientation> AdaptE;
		AdaptE adaptE(p + storageE.offset,size1(), size2(), stride1,stride2);

		assign(X, matrix_scalar_multiply<AdaptE >( adaptE, alpha));

		//unmap memory
		e().queue().enqueue_unmap_buffer(buffer,p);
	}

	template<class MatX, class MatE>
	void plus_assign_to(
		matrix_expression<MatX, cpu_tag>& X, matrix_expression<MatE, gpu_tag> const& e,
		value_type const& alpha, dense_tag
	)const{
		auto storageE = e().raw_storage();
		auto& buffer = storageE.buffer;
		//map buffer to host memory (read-only: only the cpu target X is modified)
		typename MatE::value_type* p = (typename MatE::value_type*) e().queue().enqueue_map_buffer(
			buffer, CL_MAP_READ, 0, buffer.size()
		);
		//adapt host memory buffer to matrix and assign
		typedef typename MatE::orientation EOrientation;
		std::size_t stride1 = EOrientation::index_M(storageE.leading_dimension,1);
		std::size_t stride2 = EOrientation::index_m(storageE.leading_dimension,1);
		typedef dense_matrix_adaptor<typename MatE::value_type, EOrientation> AdaptE;
		AdaptE adaptE(p + storageE.offset, size1(), size2(), stride1,stride2);

		plus_assign(X,matrix_scalar_multiply<AdaptE >( adaptE, alpha));

		//unmap memory
		e().queue().enqueue_unmap_buffer(buffer,p);
	}

	//result has unknown storage, so evaluate into temporary on gpu and assign that to host
	template<class MatX, class MatE>
	void assign_to(
		matrix_expression<MatX, cpu_tag>& X, matrix_expression<MatE, gpu_tag>const& e,
		value_type const& alpha, unknown_tag
	)const{
		//evaluate expression on gpu and assign that to host
		typedef typename matrix_temporary<E>::type result_type;
		result_type result = m_expression;

		//the temporary has dense storage, so this recurses into the dense overload above
		assign_to(X, result, alpha, typename result_type::storage_type::storage_tag());
	}

	template<class MatX, class MatE>
	void plus_assign_to(
		matrix_expression<MatX, cpu_tag>& X, matrix_expression<MatE, gpu_tag>const& e,
		value_type const& alpha, unknown_tag
	)const{
		//evaluate expression on gpu and assign that to host
		typedef typename matrix_temporary<E>::type result_type;
		result_type result = m_expression;

		plus_assign_to(X, result, alpha, typename result_type::storage_type::storage_tag());
	}

	expression_closure_type m_expression;
};
400 
401 template<class E>
402 class matrix_transport_to_gpu: public matrix_expression<matrix_transport_to_gpu<E>, gpu_tag>{
403 public:
404  typedef typename E::const_closure_type expression_closure_type;
405 
406  typedef typename E::value_type value_type;
407  typedef typename E::size_type size_type;
408  typedef value_type const& const_reference;
409  typedef const_reference reference;
410 
411  typedef matrix_transport_to_gpu const_closure_type;
412  typedef matrix_transport_to_gpu closure_type;
413  typedef unknown_storage storage_type;
414  typedef unknown_storage const_storage_type;
415  typedef blockwise<typename E::evaluation_category::tag> evaluation_category;
416  typedef typename E::orientation orientation;
417 
418  //FIXME: This is required even though iterators for block expressions are meaningless
419  typedef typename E::const_row_iterator const_row_iterator;
420  typedef typename E::const_column_iterator const_column_iterator;
421  typedef const_row_iterator row_iterator;
422  typedef const_column_iterator column_iterator;
423 
424  // Construction and destruction
425  explicit matrix_transport_to_gpu(
426  expression_closure_type const& expression,
427  boost::compute::command_queue& queue
428  ):m_expression(expression), m_queue(&queue){}
429 
430  size_type size1() const {
431  return m_expression.size1();
432  }
433  size_type size2() const {
434  return m_expression.size2();
435  }
436  expression_closure_type const& expression() const {
437  return m_expression;
438  }
439  boost::compute::command_queue& queue() const{
440  return *m_queue;
441  }
442 
443  //dispatcher to computation kernels
444  template<class MatX>
445  void assign_to(matrix_expression<MatX, gpu_tag>& X, value_type const& alpha = value_type(1) )const{
446  assign_to(X, m_expression, alpha, typename E::storage_type::storage_tag());
447  }
448  template<class MatX>
449  void plus_assign_to(matrix_expression<MatX, gpu_tag>& X, value_type const& alpha = value_type(1) )const{
450  plus_assign_to(X, m_expression, alpha, typename E::storage_type::storage_tag());
451  }
452 
453  template<class MatX>
454  void minus_assign_to(matrix_expression<MatX, cpu_tag>& X, value_type alpha = value_type(1) )const{
455  plus_assign_to(X,-alpha);
456  }
457 
458 private:
459  //result is represented as dense storage on gpu, i.e. does not need to be calculated
460  template<class MatX, class MatE>
461  void assign_to(
462  matrix_expression<MatX, gpu_tag>& X, matrix_expression<MatE, cpu_tag> const& e,
463  value_type const& alpha, dense_tag
464  )const{
465  auto storageX = X().raw_storage();
466  auto& buffer = storageX.buffer;
467  //map buffer to host memory
468  typename MatX::value_type* p = (typename MatX::value_type*) X().queue().enqueue_map_buffer(
469  buffer, CL_MAP_WRITE, 0, buffer.size()
470  );
471  //adapt host memory buffer to vector and assign
472  typedef typename MatX::orientation XOrientation;
473  std::size_t stride1 = XOrientation::index_M(storageX.leading_dimension, 1);
474  std::size_t stride2 = XOrientation::index_m(storageX.leading_dimension, 1);
475  dense_matrix_adaptor<typename MatX::value_type, XOrientation> adaptX(p, size1(), size2(), stride1, stride2);
476  assign(adaptX,matrix_scalar_multiply<MatE>(e(),alpha));
477 
478  //unmap memory
479  X().queue().enqueue_unmap_buffer(buffer,p);
480  }
481 
482  template<class MatX, class MatE>
483  void plus_assign_to(
484  matrix_expression<MatX, gpu_tag>& X, matrix_expression<MatE, cpu_tag> const& e,
485  value_type const& alpha, dense_tag
486  )const{
487  auto storageX = X().raw_storage();
488  auto& buffer = storageX.buffer;
489  //map buffer to host memory
490  typename MatX::value_type* p = (typename MatX::value_type*) X().queue().enqueue_map_buffer(
491  buffer, CL_MAP_WRITE, 0, buffer.size()
492  );
493  //adapt host memory buffer to matrix and assign
494  typedef typename MatX::orientation XOrientation;
495  std::size_t stride1 = XOrientation::index_M(storageX.leading_dimension, 1);
496  std::size_t stride2 = XOrientation::index_m(storageX.leading_dimension, 1);
497  typedef dense_matrix_adaptor<typename MatX::value_type, XOrientation> AdaptX;
498  AdaptX adaptX(p + storageX.offset, size1(), size2(), stride1, stride2);
499 
500  plus_assign(adaptX,matrix_scalar_multiply<MatE >( e(), alpha));
501 
502  //unmap memory
503  X().queue().enqueue_unmap_buffer(buffer,p);
504  }
505 
506  expression_closure_type m_expression;
507  boost::compute::command_queue* m_queue;
508 };
509 
510 
511 ///////////////////////////////////////////////
512 ////////Proxy expressions
513 ///////////////////////////////////////////////
514 
515 template<class E>
516 vector_transport_to_cpu<E> copy_to_cpu(vector_expression<E, gpu_tag> const& e){
517  return vector_transport_to_cpu<E>(e());
518 }
519 
520 template<class E>
521 matrix_transport_to_cpu<E> copy_to_cpu(matrix_expression<E, gpu_tag> const& e){
522  return matrix_transport_to_cpu<E>(e());
523 }
524 template<class E>
525 vector_transport_to_gpu<E> copy_to_gpu(
526  vector_expression<E, cpu_tag> const& e,
527  boost::compute::command_queue& queue = boost::compute::system::default_queue()
528 ){
529  return vector_transport_to_gpu<E>(e(), queue);
530 }
531 
532 template<class E>
533 matrix_transport_to_gpu<E> copy_to_gpu(
534  matrix_expression<E, cpu_tag> const& e,
535  boost::compute::command_queue& queue = boost::compute::system::default_queue()
536 ){
537  return matrix_transport_to_gpu<E>(e(),queue);
538 }
539 
540 
541 //moving gpu->gpu is for free
542 template<class E>
543 E const& copy_to_gpu(
544  vector_expression<E, gpu_tag> const& e,
545  boost::compute::command_queue& queue = boost::compute::system::default_queue()
546 ){
547  return e();
548 }
549 
550 template<class E>
551 E const& copy_to_gpu(
552  matrix_expression<E, gpu_tag> const& e,
553  boost::compute::command_queue& queue = boost::compute::system::default_queue()
554 ){
555  return e();
556 }
557 
558 }
559 
560 #endif