conv2d.hpp
Go to the documentation of this file.
1 /*!
2  *
3  *
4  * \brief Implements the 2D convolution kernel for cpus
5  *
6  * \author O. Krause
7  * \date 2016
8  *
9  *
10  * \par Copyright 1995-2015 Shark Development Team
11  *
12  * <BR><HR>
13  * This file is part of Shark.
14  * <http://image.diku.dk/shark/>
15  *
16  * Shark is free software: you can redistribute it and/or modify
17  * it under the terms of the GNU Lesser General Public License as published
18  * by the Free Software Foundation, either version 3 of the License, or
19  * (at your option) any later version.
20  *
21  * Shark is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24  * GNU Lesser General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28  *
29  */
30 
31 #ifndef REMORA_KERNELS_DEFAULT_Conv2D_HPP
32 #define REMORA_KERNELS_DEFAULT_Conv2D_HPP
33 
34 #include "simd.hpp"
35 #include "../gemm.hpp"
36 #include <type_traits> //for std::common_type and aligned storage
37 namespace remora{namespace bindings {
38 
39 
40 /// \brief Transforms the given image into a row-major format for convolution
41 ///
42 /// The resulting matrix has one row for each output of the convolution.
43 /// each row contains the patch used for computing the result.
44 template<class E1, class E2>
45 void im2mat(
46  vector_expression<E1, cpu_tag> const& image,
47  matrix_expression<E2, cpu_tag>& output,
48  std::size_t num_channels,
49  std::size_t image_height,
50  std::size_t image_width,
51  std::size_t filter_height,
52  std::size_t filter_width
53 ){
54  //the order of loops is chosen such, that only very little changes of rows are performed
55  for(std::size_t c = 0; c != num_channels; ++c){//iterate over the channels
56  for(std::size_t i = 0; i != image_height - filter_height +1; ++i){// iterate over row-positions in the image
57  for(std::size_t i1 = 0; i1 != filter_height; ++i1){//iterate over the the rows of the current filter
58  for(std::size_t j = 0; j != image_width - filter_width +1; ++j){//iterate over the column-position in the image
59  std::size_t row_start = i * (image_width - filter_width +1) +j;
60  std::size_t image_start = c * image_width * image_height + (i+i1) * image_width + j;
61  std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
62  for(std::size_t j1 = 0; j1 != filter_width; ++j1){
63  output()(row_start, col_start +j1) = image()(image_start + j1);
64  }
65  }
66  }
67  }
68  }
69 }
70 
71 template<class E1, class E2>
72 void im2mat_pad(
73  vector_expression<E1, cpu_tag> const& image,
74  matrix_expression<E2, cpu_tag>& output,
75  std::size_t num_channels,
76  std::size_t image_height,
77  std::size_t image_width,
78  std::size_t filter_height,
79  std::size_t filter_width,
80  std::size_t padding_height,
81  std::size_t padding_width
82 ){
83  std::size_t image_start1 = padding_height/2;
84  std::size_t image_end1 = image_height + image_start1;
85  std::size_t image_start2 = padding_width/2;
86  std::size_t image_end2 = image_width + image_start2;
87  std::size_t output_width = image_width - filter_width + 1 + padding_width;
88  std::size_t output_height = image_height - filter_height + 1 + padding_height;
89  //the order of loops is chosen such, that only very little changes of rows are performed
90  for(std::size_t c = 0; c != num_channels; ++c){//iterate over the channels
91  for(std::size_t i = 0; i != output_height; ++i){// iterate over row-positions in the output
92  for(std::size_t i1 = 0; i1 != filter_height; ++i1){//iterate over the the rows of the current filter
93  if(i1+i < image_start1 || i1+i >= image_end1){//special case: we are on the border above or below
94  for(std::size_t j = 0; j != output_width; ++j){//iterate over the column-position in the output
95  std::size_t row_start = i * output_width +j;
96  std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
97  for(std::size_t j1 = 0; j1 != filter_width; ++j1){
98  output()(row_start, col_start +j1) = 0;
99  }
100  }
101  continue;//no need to got on, we are done
102  }
103  for(std::size_t j = 0; j != output_width; ++j){//iterate over the column-position in the output
104  std::size_t row_start = i * output_width +j;
105  std::size_t image_start = c * image_width * image_height + (i+i1 - image_start1) * image_width + j-image_start2;
106  std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
107  for(std::size_t j1 = 0; j1 != filter_width; ++j1){
108  if(j+j1 < image_start2 || j+j1 >= image_end2)
109  output()(row_start, col_start + j1) = 0;
110  else
111  output()(row_start, col_start + j1) = image()(image_start + j1);
112  }
113  }
114  }
115  }
116  }
117 }
118 
119 
120 template<class E1, class E2, class M>
121 void conv2d(
122  vector_expression<E1, cpu_tag> const& image,
123  vector_expression<E2, cpu_tag> const& filter,
124  vector_expression<M, cpu_tag>& output,
125  std::size_t num_channels,
126  std::size_t num_filters,
127  std::size_t image_height,
128  std::size_t image_width,
129  std::size_t filter_height,
130  std::size_t filter_width,
131  std::size_t padding_height,
132  std::size_t padding_width
133 ){
134  typedef typename std::common_type<
135  typename E1::value_type, typename E2::value_type, typename M::value_type
136  >::type value_type;
137 
138  std::size_t output_rows_per_filter = (image_height - filter_height +1 + padding_height) * (image_width - filter_width +1 + padding_width);
139 
140  std::size_t filter_size = filter_width * filter_height * num_channels;
141 
142  REMORA_SIZE_CHECK(output().size() == num_filters * output_rows_per_filter);
143  REMORA_SIZE_CHECK(image().size() == num_channels * image_width * image_height);
144  REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size);
145 
146  //allocate storage and create temporary matrices
147  boost::alignment::aligned_allocator<value_type,64> allocator;
148  value_type* image_storage = allocator.allocate( output_rows_per_filter * filter_size);
149  value_type* filter_storage = allocator.allocate(num_filters * filter_size);
150  dense_matrix_adaptor<value_type, row_major, cpu_tag> image_transformed(image_storage,output_rows_per_filter, filter_size);
151  dense_matrix_adaptor<value_type, row_major, cpu_tag> filter_transformed(filter_storage, num_filters, filter_size);
152  dense_matrix_adaptor<value_type, row_major, cpu_tag> output_transformed(output(), num_filters, output_rows_per_filter);
153  //copy image to temporary storage
154  if(padding_height == 0 && padding_width == 0){
155  im2mat(image,image_transformed, num_channels, image_height, image_width, filter_height, filter_width);
156  }else{
157  im2mat_pad(image,image_transformed, num_channels, image_height, image_width, filter_height, filter_width, padding_height, padding_width);
158  }
159  //copy filters to temporary storage
160  for(std::size_t f = 0; f != num_filters; ++f){
161  for(std::size_t i = 0; i != filter_size; ++i){
162  filter_transformed(f,i) = filter()(f * filter_size + i);
163  }
164  }
165 
166  //do the computation
167  kernels::gemm(filter_transformed, trans(image_transformed), output_transformed, value_type(1.0));
168 
169  //deallocate storage
170  allocator.deallocate(image_storage,output_rows_per_filter * filter_size);
171  allocator.deallocate(filter_storage, num_filters * filter_size);
172 }
173 
174 }}
175 
176 #endif