conv2d.cpp
Go to the documentation of this file.
1 #define SHARK_USE_SIMD
#include <shark/LinAlg/BLAS/blas.h>
#include <shark/Core/Timer.h>
#include <algorithm>
#include <iostream>
#include <limits>
6 using namespace shark;
7 using namespace std;
8 
9 template<class E1, class E2>
10 void benchmark(
11  blas::matrix_expression<E1, blas::cpu_tag> const& image,
12  blas::matrix_expression<E2, blas::cpu_tag> const& filter,
13  std::size_t num_channels,
14  std::size_t num_filters
15 ){
16  std::size_t filter_size = filter().size2();
17  std::size_t image_size1 = image().size1()/num_channels;
18  std::size_t image_size2 = image().size2();
19  std::size_t output_size1 = image_size1 - filter_size +1;
20  std::size_t output_size2 = image_size2 - filter_size +1;
21  typedef typename E1::value_type value_type;
22 
23  blas::matrix<value_type> out(output_size1 * num_filters, output_size2 ,0.0);
24  double minOptTime = std::numeric_limits<double>::max();
25  for(std::size_t i = 0; i != 20; ++i){
26  Timer time;
27  blas::kernels::conv2d(image,filter,out, num_channels, num_filters);
28  minOptTime = min(minOptTime,time.stop());
29  }
30 
31  double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels;
32  double flops = mults /1024/1024/minOptTime;
33 
34  std::cout<<output_size1<<"\t"<<filter_size<<"\t"<<num_channels<<"\t"<< num_filters<<"\t";
35  std::cout<<"\t"<<flops<< std::endl;
36 }
37 
38 
39 int main(int argc, char **argv) {
40  std::cout<<"Flops"<<std::endl;
41  std::size_t num_channels = 8;
42  std::size_t num_outputs = 16;
43  std::cout<<"performance float"<<std::endl;
44  for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
45  for(std::size_t iter = 0; iter != 6; ++iter){
46  std::size_t sizeOut1 = (3+16 * 2<<iter);
47  std::size_t sizeOut2 = (3+16 * 2<<iter);
48  std::size_t sizeIm1 = sizeOut1 + filterSize-1;
49  std::size_t sizeIm2 = sizeOut2 + filterSize-1;
50 
51  blas::matrix<float> image(num_channels * sizeIm1 , sizeIm2);
52  blas::matrix<float> filter(num_channels * num_outputs * filterSize, filterSize);
53 
54  for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
55  for(std::size_t j = 0; j != sizeIm2; ++j){
56  image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
57  }
58  }
59  for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
60  for(std::size_t j = 0; j != filterSize; ++j){
61  filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
62  }
63  }
64 
65  benchmark(image,filter,num_channels,num_outputs);
66  }
67  }
68  num_outputs = 8;
69  std::cout<<"performance double"<<std::endl;
70  for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
71  for(std::size_t iter = 0; iter != 6; ++iter){
72  std::size_t sizeOut1 = (3+16 * 2<<iter);
73  std::size_t sizeOut2 = (3+16 * 2<<iter);
74  std::size_t sizeIm1 = sizeOut1 + filterSize-1;
75  std::size_t sizeIm2 = sizeOut2 + filterSize-1;
76 
77  blas::matrix<double> image(num_channels * sizeIm1 , sizeIm2);
78  blas::matrix<double> filter(num_channels * num_outputs * filterSize, filterSize);
79 
80  for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
81  for(std::size_t j = 0; j != sizeIm2; ++j){
82  image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
83  }
84  }
85  for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
86  for(std::size_t j = 0; j != filterSize; ++j){
87  filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
88  }
89  }
90 
91  benchmark(image,filter,num_channels,num_outputs);
92  }
93  }
94 }