Auto choose conv implementation (Tencent#1085)
* add the accompanying README_CN.md;
* obtain time cost with op->forward().
tpoisonooo authored and nihui committed Jul 18, 2019
1 parent e9c890a commit 1ca4387
Showing 8 changed files with 359 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -91,6 +91,8 @@ elseif(NCNN_AVX2)
    else()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
    endif()
elseif(LINUX)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fno-exceptions")
endif()

##############################################
101 changes: 82 additions & 19 deletions src/layer/arm/convolution_arm.cpp
@@ -137,6 +137,41 @@ int Convolution_arm::create_pipeline(const Option& opt)
        return 0;
    }

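    // impl_type comes from the auto-packed model (param id 15):
    // 1 = winograd, 2 = pointwise sgemm, 3 = im2col sgemm, 4 = direct, 5 = conv3x3s2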
    if (impl_type > 0)
    {
        int num_input = 0;
        int kernel_size = 0;
        switch (impl_type)
        {
            case 1:
                // winograd
                num_input = weight_data_size / 9 / num_output;
                conv3x3s1_winograd64_transform_kernel_neon5(weight_data, weight_3x3_winograd64_data, num_input, num_output);
                break;
            case 2:
                // pointwise
                num_input = weight_data_size / num_output;
                conv1x1s1_sgemm_transform_kernel_neon(weight_data, weight_1x1_sgemm_data, num_input, num_output);
                break;
            case 3:
                // im2col
                kernel_size = kernel_w * kernel_h;
                num_input = weight_data_size / kernel_size / num_output;
                conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_size);
                break;
            case 4:
                // direct
                break;
            case 5:
                // conv3x3s2
                num_input = weight_data_size / 9 / num_output;
                conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
                break;
            default:
                return -1;
        }
        return 0;
    }

    if (use_winograd3x3)
    {
        int num_input = weight_data_size / 9 / num_output;
@@ -596,28 +631,56 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
    if (top_blob.empty())
        return -100;

    if (impl_type > 0)
    {
        // engineering is magic.
        switch (impl_type)
        {
            case 1:
                conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
                break;
            case 2:
                conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
                break;
            case 3:
                conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
                break;
            case 4:
                conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                break;
            case 5:
                conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
                break;
            default:
                return -1;
        }
    }
    else
    {
        if (use_winograd3x3 && w <= 120 && h <= 120)
        {
            // conv3x3s1_winograd64_neon4(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
            conv3x3s1_winograd64_neon5(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
        }
        else if (use_sgemm1x1)
        {
            conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            if (outw >= 8 && outh >= 8)
                conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);
            else
                conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
        }
        else
            conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
    }

    if (activation)
    {
1 change: 1 addition & 0 deletions src/layer/convolution.cpp
@@ -45,6 +45,7 @@ int Convolution::load_param(const ParamDict& pd)
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());
    impl_type = pd.get(15, 0);

    return 0;
}
3 changes: 3 additions & 0 deletions src/layer/convolution.h
@@ -74,6 +74,9 @@ class Convolution : public Layer
    // merge de/requantize op into convolution op
    std::vector<float> dequantize_scales;
    std::vector<float> requantize_scales;

    // implementation type; 0 means the auto pack model is not used
    int impl_type;
};

} // namespace ncnn
4 changes: 2 additions & 2 deletions toolchains/aarch64-linux-gnu.toolchain.cmake
@@ -15,5 +15,5 @@ SET ( CMAKE_CXX_FLAGS "-std=c++11 -march=armv8-a -fopenmp ${CMAKE_CXX_FLAGS}" )

# other settings
add_definitions(-D__ARM_NEON)
-add_definitions(-D__ANDROID__)
-SET ( ANDROID true)
+add_definitions(-DLINUX)
+SET ( LINUX true)
60 changes: 60 additions & 0 deletions tools/README_CN.md
@@ -0,0 +1,60 @@
## ncnn optimize: auto pack model technical documentation

### Prerequisites

1. Prepare an arm linux development board first; working directly on a phone makes the process more cumbersome. If the model is meant to run on a phone with a qcom845 chip, try to get an 845 development board. If resources are limited, a board with the same major architecture (armv8 vs. armv7) is acceptable, and the final optimized speed can be extrapolated from the CPU clock frequency;

2. Use the tools under ncnn/tools to convert the trained model into a format ncnn supports, e.g. ncnn.param and ncnn.bin (see the example below).
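
For instance, a Caffe model converts with the caffe2ncnn tool shipped in ncnn/tools; the file names here are placeholders:
```
./caffe2ncnn deploy.prototxt snapshot.caffemodel ncnn.param ncnn.bin
```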

### Usage

Run the command directly:
```
ncnn optimize ncnn.param ncnn.bin out.param out.bin 0 data 227 227 3
Input [w h nc]: 227 227 3
Kernel [w h nc]: 3 3 192
Output [w h nc]: 113 113 64
im2col cost 14.188ms
direct cost 9.394ms
conv3x3s2 cost 6.555ms
conv1 use conv3x3s2
Input [w h nc]: 56 56 64
Kernel [w h nc]: 1 1 1024
Output [w h nc]: 56 56 16
im2col cost 1.812ms
direct cost 1.995ms
fire2/squeeze1x1 use im2col
Input [w h nc]: 56 56 16
Kernel [w h nc]: 1 1 1024
Output [w h nc]: 56 56 64
im2col cost 1.223ms
direct cost 2.169ms
fire2/expand1x1 use im2col
Input [w h nc]: 58 58 16
Kernel [w h nc]: 3 3 1024
Output [w h nc]: 56 56 64
winograd cost 5.853ms
im2col cost 10.480ms
direct cost 6.752ms
fire2/expand3x3 use winograd
...
```
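
The optimized out.param / out.bin load like any other ncnn model, so nothing changes on the inference side. A minimal sketch, assuming a 227x227x3 input and a hypothetical output blob named prob:
```
#include "net.h" // ncnn::Net, ncnn::Mat, ncnn::Extractor

int main()
{
    ncnn::Net net;
    net.load_param("out.param"); // auto-packed param, carries 15=... on conv layers
    net.load_model("out.bin");

    ncnn::Mat in(227, 227, 3); // w, h, c -- the same size passed to ncnn optimize
    ncnn::Mat out;

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);
    ex.extract("prob", out); // "prob" is a hypothetical output blob name
    return 0;
}
```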

Here `data` is the name of the input layer; since networks commonly have a single input layer, only one is supported for now.
`227 227 3` is the actual input size in WHC order; a different size can lead to a different final choice. Since batch size N has no real use case in on-device inference, we take N = 1.
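
The winning implementation is written back into out.param as convolution parameter id 15, which the runtime reads via `impl_type = pd.get(15, 0);`. As a purely hypothetical illustration of the syntax, the conv1 layer from the log above might end up as:
```
Convolution conv1 1 1 data conv1 0=64 1=3 3=2 6=1728 15=5
```
where 15=5 selects the conv3x3s2 kernel that won the benchmark.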

### How it works

First, accept one premise: convolution optimization cannot be solved by any single method; there is no one trick that wins everywhere.
On that basis, each method (MEC/FFT/direct/winograd) has its own speed advantage under different conditions (input size, memory, core count, power budget, and so on).
As for which one is fastest, simply run them all and measure. Treat a black box with a black box; there is no need for a pile of hand-written heuristics.
This principle guarantees that a model is never slower after auto pack than before.
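
A minimal sketch of that measurement loop, matching the commit note that time cost is obtained with op->forward(); `make_conv_candidate` is a hypothetical factory, not part of ncnn:
```
#include <chrono>
#include "layer.h" // ncnn::Layer, ncnn::Mat, ncnn::Option

// Hypothetical: build a convolution layer forced to the given impl_type,
// with weights already transformed by create_pipeline(). Not part of ncnn.
ncnn::Layer* make_conv_candidate(int impl_type);

// Time one candidate by running its forward pass.
static double time_forward_ms(ncnn::Layer* op, const ncnn::Mat& in, ncnn::Mat& out, const ncnn::Option& opt)
{
    auto t0 = std::chrono::high_resolution_clock::now();
    op->forward(in, out, opt); // the measured call
    auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

// Try impl_type 1..5 (winograd, pointwise, im2col, direct, conv3x3s2)
// and keep the fastest; the winner is stored in the param file as id 15.
static int pick_impl_type(const ncnn::Mat& in, const ncnn::Option& opt)
{
    int best_impl = 0;
    double best_cost = 1e30;
    for (int impl = 1; impl <= 5; impl++)
    {
        ncnn::Layer* op = make_conv_candidate(impl);
        ncnn::Mat out;
        double cost = time_forward_ms(op, in, out, opt);
        if (cost < best_cost)
        {
            best_cost = cost;
            best_impl = impl;
        }
        delete op;
    }
    return best_impl;
}
```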

By the same token, the same approach can be used to find the fastest or most power-efficient FC/pooling/dwConv implementation.

### Acknowledgements
Finally, thanks to the author for open-sourcing ncnn back in late 2016.
4 changes: 2 additions & 2 deletions tools/ncnn2mem.cpp
@@ -143,7 +143,7 @@ static int dump_param(const char* parampath, const char* parambinpath, const cha
    fprintf(ip, "const int LAYER_%s = %d;\n", layer_name, i);

    // layer->bottoms.resize(bottom_count);
-    for (int i=0; i<bottom_count; i++)
+    for (int j=0; j<bottom_count; j++)
    {
        char bottom_name[257];
        nscan = fscanf(fp, "%256s", bottom_name);
@@ -161,7 +161,7 @@
    }

    // layer->tops.resize(top_count);
-    for (int i=0; i<top_count; i++)
+    for (int j=0; j<top_count; j++)
    {
        char blob_name[257];
        nscan = fscanf(fp, "%256s", blob_name);
