Auto choose conv implementation (Tencent#1085)
* add the accompanying README_CN.md;
* obtain time cost with op->forward().
tpoisonooo authored and nihui committed Jul 18, 2019
1 parent e9c890a commit 1ca4387
Showing 8 changed files with 359 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -91,6 +91,8 @@ elseif(NCNN_AVX2)
    else()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
    endif()
elseif(LINUX)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fno-exceptions")
endif()

##############################################
101 changes: 82 additions & 19 deletions src/layer/arm/convolution_arm.cpp
@@ -137,6 +137,41 @@ int Convolution_arm::create_pipeline(const Option& opt)
        return 0;
    }

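    // impl_type comes from the auto-packed model (param id 15):
    // 1 = winograd, 2 = pointwise sgemm, 3 = im2col sgemm, 4 = direct, 5 = conv3x3s2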
    if (impl_type > 0)
    {
        int num_input = 0;
        int kernel_size = 0;
        switch (impl_type)
        {
            case 1:
                // winograd
                num_input = weight_data_size / 9 / num_output;
                conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
                break;
            case 2:
                // pointwise
                num_input = weight_data_size / num_output;
                conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
                break;
            case 3:
                // im2col
                kernel_size = kernel_w * kernel_h;
                num_input = weight_data_size / kernel_size / num_output;
                conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_size);
                break;
            case 4:
                // direct
                break;
            case 5:
                // conv3x3s2
                num_input = weight_data_size / 9 / num_output;
                conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
                break;
            default:
                return -1;
        }
        return 0;
    }

    if (use_winograd3x3)
    {
        int num_input = weight_data_size / 9 / num_output;
@@ -596,28 +631,56 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
    if (top_blob.empty())
        return -100;

    if (impl_type > 0)
    {
        // engineering is magic.
        switch (impl_type)
        {
            case 1:
                conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
                break;
            case 2:
                conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
                break;
            case 3:
                conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
                break;
            case 4:
                conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                break;
            case 5:
                conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
                break;
            default:
                return -1;
        }
    }
    else
    {
        if (use_winograd3x3 && w <= 120 && h <= 120)
        {
            // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
            conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
        }
        else if (use_sgemm1x1)
        {
            conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            if (outw >= 8 && outh >= 8)
                conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
            else
                conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
        }
        else
            conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
    }

    if (activation)
    {
1 change: 1 addition & 0 deletions src/layer/convolution.cpp
@@ -45,6 +45,7 @@ int Convolution::load_param(const ParamDict& pd)
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());
    impl_type = pd.get(15, 0);

    return 0;
}
3 changes: 3 additions & 0 deletions src/layer/convolution.h
@@ -74,6 +74,9 @@ class Convolution : public Layer
    // merge de/requantize op into convolution op
    std::vector<float> dequantize_scales;
    std::vector<float> requantize_scales;

    // implementation type; 0 means the auto pack model is not used
    int impl_type;
};

} // namespace ncnn
4 changes: 2 additions & 2 deletions toolchains/aarch64-linux-gnu.toolchain.cmake
@@ -15,5 +15,5 @@ SET ( CMAKE_CXX_FLAGS "-std=c++11 -march=armv8-a -fopenmp ${CMAKE_CXX_FLAGS}" )

# other settings
add_definitions(-D__ARM_NEON)
-add_definitions(-D__ANDROID__)
-SET ( ANDROID true)
+add_definitions(-DLINUX)
+SET ( LINUX true)
60 changes: 60 additions & 0 deletions tools/README_CN.md
@@ -0,0 +1,60 @@
## ncnn optimize: auto pack model technical documentation

### Prerequisites

1. Prepare an arm linux development board first; working directly on a phone makes the process more cumbersome. If the model is meant to run on a phone with a qcom845 chip, try to get an 845 development board. If resources are limited, a board with the same major architecture (armv8 vs. armv7) is acceptable, and the final optimized speed can be extrapolated from the CPU clock frequency;

2. Use the tools under ncnn/tools to convert the trained model into a format ncnn supports, e.g. ncnn.param and ncnn.bin (see the example below).
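
For instance, a Caffe model converts with the caffe2ncnn tool shipped in ncnn/tools; the file names here are placeholders:
```
./caffe2ncnn deploy.prototxt snapshot.caffemodel ncnn.param ncnn.bin
```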

### Usage

Run the command directly:
```
ncnn optimize ncnn.param ncnn.bin out.param out.bin 0 data 227 227 3
Input [w h nc]: 227 227 3
Kernel [w h nc]: 3 3 192
Output [w h nc]: 113 113 64
im2col cost 14.188ms
direct cost 9.394ms
conv3x3s2 cost 6.555ms
conv1 use conv3x3s2
Input [w h nc]: 56 56 64
Kernel [w h nc]: 1 1 1024
Output [w h nc]: 56 56 16
im2col cost 1.812ms
direct cost 1.995ms
fire2/squeeze1x1 use im2col
Input [w h nc]: 56 56 16
Kernel [w h nc]: 1 1 1024
Output [w h nc]: 56 56 64
im2col cost 1.223ms
direct cost 2.169ms
fire2/expand1x1 use im2col
Input [w h nc]: 58 58 16
Kernel [w h nc]: 3 3 1024
Output [w h nc]: 56 56 64
winograd cost 5.853ms
im2col cost 10.480ms
direct cost 6.752ms
fire2/expand3x3 use winograd
...
```
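
The optimized out.param / out.bin load like any other ncnn model, so nothing changes on the inference side. A minimal sketch, assuming a 227x227x3 input and a hypothetical output blob named prob:
```
#include "net.h" // ncnn::Net, ncnn::Mat, ncnn::Extractor

int main()
{
    ncnn::Net net;
    net.load_param("out.param"); // auto-packed param, carries 15=... on conv layers
    net.load_model("out.bin");

    ncnn::Mat in(227, 227, 3); // w, h, c -- the same size passed to ncnn optimize
    ncnn::Mat out;

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);
    ex.extract("prob", out); // "prob" is a hypothetical output blob name
    return 0;
}
```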

Here `data` is the name of the input layer; since networks commonly have a single input layer, only one is supported for now.
`227 227 3` is the actual input size in WHC order; a different size can lead to a different final choice. Since batch size N has no real use case in on-device inference, we take N = 1.
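
The winning implementation is written back into out.param as convolution parameter id 15, which the runtime reads via `impl_type = pd.get(15, 0);`. As a purely hypothetical illustration of the syntax, the conv1 layer from the log above might end up as:
```
Convolution conv1 1 1 data conv1 0=64 1=3 3=2 6=1728 15=5
```
where 15=5 selects the conv3x3s2 kernel that won the benchmark.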

### How it works

First, accept one premise: convolution optimization cannot be solved by any single method; there is no one trick that wins everywhere.
On that basis, each method (MEC/FFT/direct/winograd) has its own speed advantage under different conditions (input size, memory, core count, power budget, and so on).
As for which one is fastest, simply run them all and measure. Treat a black box with a black box; there is no need for a pile of hand-written heuristics.
This principle guarantees that a model is never slower after auto pack than before.
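
A minimal sketch of that measurement loop, matching the commit note that time cost is obtained with op->forward(); `make_conv_candidate` is a hypothetical factory, not part of ncnn:
```
#include <chrono>
#include "layer.h" // ncnn::Layer, ncnn::Mat, ncnn::Option

// Hypothetical: build a convolution layer forced to the given impl_type,
// with weights already transformed by create_pipeline(). Not part of ncnn.
ncnn::Layer* make_conv_candidate(int impl_type);

// Time one candidate by running its forward pass.
static double time_forward_ms(ncnn::Layer* op, const ncnn::Mat& in, ncnn::Mat& out, const ncnn::Option& opt)
{
    auto t0 = std::chrono::high_resolution_clock::now();
    op->forward(in, out, opt); // the measured call
    auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

// Try impl_type 1..5 (winograd, pointwise, im2col, direct, conv3x3s2)
// and keep the fastest; the winner is stored in the param file as id 15.
static int pick_impl_type(const ncnn::Mat& in, const ncnn::Option& opt)
{
    int best_impl = 0;
    double best_cost = 1e30;
    for (int impl = 1; impl <= 5; impl++)
    {
        ncnn::Layer* op = make_conv_candidate(impl);
        ncnn::Mat out;
        double cost = time_forward_ms(op, in, out, opt);
        if (cost < best_cost)
        {
            best_cost = cost;
            best_impl = impl;
        }
        delete op;
    }
    return best_impl;
}
```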

By the same token, the same approach can be used to find the fastest or most power-efficient FC/pooling/dwConv implementation.

### Acknowledgements
Finally, thanks to the author for open-sourcing ncnn back in late 2016.
4 changes: 2 additions & 2 deletions tools/ncnn2mem.cpp
@@ -143,7 +143,7 @@ static int dump_param(const char* parampath, const char* parambinpath, const cha
    fprintf(ip, "const int LAYER_%s = %d;\n", layer_name, i);

    // layer->bottoms.resize(bottom_count);
-    for (int i=0; i<bottom_count; i++)
+    for (int j=0; j<bottom_count; j++)
    {
        char bottom_name[257];
        nscan = fscanf(fp, "%256s", bottom_name);
@@ -161,7 +161,7 @@
    }

    // layer->tops.resize(top_count);
-    for (int i=0; i<top_count; i++)
+    for (int j=0; j<top_count; j++)
    {
        char blob_name[257];
        nscan = fscanf(fp, "%256s", blob_name);
