2 #include <shark/LinAlg/BLAS/blas.h> 9 template<
class E1,
class E2>
11 blas::matrix_expression<E1, blas::cpu_tag>
const& image,
12 blas::matrix_expression<E2, blas::cpu_tag>
const& filter,
13 std::size_t num_channels,
14 std::size_t num_filters
16 std::size_t filter_size = filter().size2();
17 std::size_t image_size1 = image().size1()/num_channels;
18 std::size_t image_size2 = image().size2();
19 std::size_t output_size1 = image_size1 - filter_size +1;
20 std::size_t output_size2 = image_size2 - filter_size +1;
21 typedef typename E1::value_type value_type;
23 blas::matrix<value_type> out(output_size1 * num_filters, output_size2 ,0.0);
24 double minOptTime = std::numeric_limits<double>::max();
25 for(std::size_t i = 0; i != 20; ++i){
27 blas::kernels::conv2d(image,filter,out, num_channels, num_filters);
28 minOptTime = min(minOptTime,time.
stop());
31 double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels;
32 double flops = mults /1024/1024/minOptTime;
34 std::cout<<output_size1<<
"\t"<<filter_size<<
"\t"<<num_channels<<
"\t"<< num_filters<<
"\t";
35 std::cout<<
"\t"<<flops<< std::endl;
39 int main(
int argc,
char **argv) {
40 std::cout<<
"Flops"<<std::endl;
41 std::size_t num_channels = 8;
42 std::size_t num_outputs = 16;
43 std::cout<<
"performance float"<<std::endl;
44 for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
45 for(std::size_t iter = 0; iter != 6; ++iter){
46 std::size_t sizeOut1 = (3+16 * 2<<iter);
47 std::size_t sizeOut2 = (3+16 * 2<<iter);
48 std::size_t sizeIm1 = sizeOut1 + filterSize-1;
49 std::size_t sizeIm2 = sizeOut2 + filterSize-1;
51 blas::matrix<float> image(num_channels * sizeIm1 , sizeIm2);
52 blas::matrix<float> filter(num_channels * num_outputs * filterSize, filterSize);
54 for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
55 for(std::size_t j = 0; j != sizeIm2; ++j){
56 image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
59 for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
60 for(std::size_t j = 0; j != filterSize; ++j){
61 filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
65 benchmark(image,filter,num_channels,num_outputs);
69 std::cout<<
"performance double"<<std::endl;
70 for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
71 for(std::size_t iter = 0; iter != 6; ++iter){
72 std::size_t sizeOut1 = (3+16 * 2<<iter);
73 std::size_t sizeOut2 = (3+16 * 2<<iter);
74 std::size_t sizeIm1 = sizeOut1 + filterSize-1;
75 std::size_t sizeIm2 = sizeOut2 + filterSize-1;
77 blas::matrix<double> image(num_channels * sizeIm1 , sizeIm2);
78 blas::matrix<double> filter(num_channels * num_outputs * filterSize, filterSize);
80 for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
81 for(std::size_t j = 0; j != sizeIm2; ++j){
82 image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
85 for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
86 for(std::size_t j = 0; j != filterSize; ++j){
87 filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
91 benchmark(image,filter,num_channels,num_outputs);