31 #ifndef REMORA_KERNELS_DEFAULT_Conv2D_HPP 32 #define REMORA_KERNELS_DEFAULT_Conv2D_HPP 35 #include "../gemm.hpp" 36 #include <type_traits> 37 namespace remora{
namespace bindings {
44 template<
class E1,
class E2>
46 vector_expression<E1, cpu_tag>
const& image,
47 matrix_expression<E2, cpu_tag>& output,
48 std::size_t num_channels,
49 std::size_t image_height,
50 std::size_t image_width,
51 std::size_t filter_height,
52 std::size_t filter_width
55 for(std::size_t c = 0; c != num_channels; ++c){
56 for(std::size_t i = 0; i != image_height - filter_height +1; ++i){
57 for(std::size_t i1 = 0; i1 != filter_height; ++i1){
58 for(std::size_t j = 0; j != image_width - filter_width +1; ++j){
59 std::size_t row_start = i * (image_width - filter_width +1) +j;
60 std::size_t image_start = c * image_width * image_height + (i+i1) * image_width + j;
61 std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
62 for(std::size_t j1 = 0; j1 != filter_width; ++j1){
63 output()(row_start, col_start +j1) = image()(image_start + j1);
71 template<
class E1,
class E2>
73 vector_expression<E1, cpu_tag>
const& image,
74 matrix_expression<E2, cpu_tag>& output,
75 std::size_t num_channels,
76 std::size_t image_height,
77 std::size_t image_width,
78 std::size_t filter_height,
79 std::size_t filter_width,
80 std::size_t padding_height,
81 std::size_t padding_width
83 std::size_t image_start1 = padding_height/2;
84 std::size_t image_end1 = image_height + image_start1;
85 std::size_t image_start2 = padding_width/2;
86 std::size_t image_end2 = image_width + image_start2;
87 std::size_t output_width = image_width - filter_width + 1 + padding_width;
88 std::size_t output_height = image_height - filter_height + 1 + padding_height;
90 for(std::size_t c = 0; c != num_channels; ++c){
91 for(std::size_t i = 0; i != output_height; ++i){
92 for(std::size_t i1 = 0; i1 != filter_height; ++i1){
93 if(i1+i < image_start1 || i1+i >= image_end1){
94 for(std::size_t j = 0; j != output_width; ++j){
95 std::size_t row_start = i * output_width +j;
96 std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
97 for(std::size_t j1 = 0; j1 != filter_width; ++j1){
98 output()(row_start, col_start +j1) = 0;
103 for(std::size_t j = 0; j != output_width; ++j){
104 std::size_t row_start = i * output_width +j;
105 std::size_t image_start = c * image_width * image_height + (i+i1 - image_start1) * image_width + j-image_start2;
106 std::size_t col_start = c * filter_width * filter_height + i1 * filter_width;
107 for(std::size_t j1 = 0; j1 != filter_width; ++j1){
108 if(j+j1 < image_start2 || j+j1 >= image_end2)
109 output()(row_start, col_start + j1) = 0;
111 output()(row_start, col_start + j1) = image()(image_start + j1);
120 template<
class E1,
class E2,
class M>
122 vector_expression<E1, cpu_tag>
const& image,
123 vector_expression<E2, cpu_tag>
const& filter,
124 vector_expression<M, cpu_tag>& output,
125 std::size_t num_channels,
126 std::size_t num_filters,
127 std::size_t image_height,
128 std::size_t image_width,
129 std::size_t filter_height,
130 std::size_t filter_width,
131 std::size_t padding_height,
132 std::size_t padding_width
134 typedef typename std::common_type<
135 typename E1::value_type,
typename E2::value_type,
typename M::value_type
138 std::size_t output_rows_per_filter = (image_height - filter_height +1 + padding_height) * (image_width - filter_width +1 + padding_width);
140 std::size_t filter_size = filter_width * filter_height * num_channels;
142 REMORA_SIZE_CHECK(output().size() == num_filters * output_rows_per_filter);
143 REMORA_SIZE_CHECK(image().size() == num_channels * image_width * image_height);
144 REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size);
147 boost::alignment::aligned_allocator<value_type,64> allocator;
148 value_type* image_storage = allocator.allocate( output_rows_per_filter * filter_size);
149 value_type* filter_storage = allocator.allocate(num_filters * filter_size);
150 dense_matrix_adaptor<value_type, row_major, cpu_tag> image_transformed(image_storage,output_rows_per_filter, filter_size);
151 dense_matrix_adaptor<value_type, row_major, cpu_tag> filter_transformed(filter_storage, num_filters, filter_size);
152 dense_matrix_adaptor<value_type, row_major, cpu_tag> output_transformed(output(), num_filters, output_rows_per_filter);
154 if(padding_height == 0 && padding_width == 0){
155 im2mat(image,image_transformed, num_channels, image_height, image_width, filter_height, filter_width);
157 im2mat_pad(image,image_transformed, num_channels, image_height, image_width, filter_height, filter_width, padding_height, padding_width);
160 for(std::size_t f = 0; f != num_filters; ++f){
161 for(std::size_t i = 0; i != filter_size; ++i){
162 filter_transformed(f,i) = filter()(f * filter_size + i);
167 kernels::gemm(filter_transformed, trans(image_transformed), output_transformed, value_type(1.0));
170 allocator.deallocate(image_storage,output_rows_per_filter * filter_size);
171 allocator.deallocate(filter_storage, num_filters * filter_size);