Kill the bits and gain the speed?
martinkersner committed Nov 30, 2019
1 parent f11b772 commit 2b17ac9
Showing 108 changed files with 625 additions and 346 deletions.
8 changes: 5 additions & 3 deletions CMakeLists.txt
@@ -32,6 +32,8 @@ option(NCNN_VULKAN "vulkan compute support" OFF)
option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF)
option(NCNN_AVX2 "optimize x86 platform with avx2" OFF)
option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
option(BISONAI_DEBUG "print debug information" OFF)
option(BISONAI_KILL_THE_BITS "enable kill the bits" OFF)

if(ANDROID OR IOS)
option(NCNN_DISABLE_RTTI "disable rtti" ON)
@@ -52,6 +54,6 @@ endif()
# add_subdirectory(examples)
add_subdirectory(benchmark)
add_subdirectory(src)
if(NCNN_BUILD_TOOLS)
add_subdirectory(tools)
endif()
# if(NCNN_BUILD_TOOLS)
# add_subdirectory(tools)
# endif()
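
The two new `BISONAI_*` options are switched on by the build scripts added later in this commit; enabling them by hand follows the usual CMake `-D` pattern. A minimal sketch for a plain host configure (shown without the Android toolchain arguments that the scripts pass; the build directory name is arbitrary):

```bash
mkdir -p build && pushd build
cmake -DBISONAI_KILL_THE_BITS=ON -DBISONAI_DEBUG=ON ..
make -j4
popd
```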
44 changes: 42 additions & 2 deletions README.md
@@ -1,7 +1,48 @@
This repository accompanies the blog post [Kill the bits and gain the speed?](https://bisonai.com/2019/11/28/kill-the-bits/).

In the root directory, you can find two build scripts, `build_kill_the_bits_aarch64.sh` and `build_kill_the_bits_armv7.sh`, for `arm64` and `armv7`, respectively.
The experimental layer definitions live in the `experiments/conv3x3` directory.
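
Each layer definition is a plain ncnn `.param` file. As an example, `experiments/conv3x3/conv2x32x3x3_2x32x7x7.param` (added in this commit) declares an `Input` layer whose `0=7 1=7 2=512` keys give width, height and channels (matching the `ncnn::Mat(7, 7, 512)` that `benchncnn` feeds it), followed by the experimental convolution; the trailing `19=`, `20=` and `21=` keys are presumably kill-the-bits specific parameters rather than stock ncnn convolution options:

```
7767517
2 2
Input data 0 1 data 0=7 1=7 2=512
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=9216 19=512 20=32 21=7
```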

## Experiments
1. Make a directory at `/data/local/tmp/kill-the-bits` on your mobile device.

```bash
adb shell
cd /data/local/tmp
mkdir kill-the-bits
```

2. Compile the inference engine.

```bash
./build_kill_the_bits_aarch64.sh
# ./build_kill_the_bits_armv7.sh
```

3. Copy `benchncnn` to your device.

```bash
adb push build-android-aarch64/benchmark/benchncnn /data/local/tmp/kill-the-bits
#adb push build-android-armv7/benchmark/benchncnn /data/local/tmp/kill-the-bits
```

4. Copy the `experiments/conv3x3` directory to `/data/local/tmp/kill-the-bits`.

```bash
adb push experiments/conv3x3 /data/local/tmp/kill-the-bits
```

5. Launch the experiments (the note after the command block below explains how to read the output).

```bash
./benchncnn 7
# ./benchncnn 14
# ./benchncnn 28
```
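
Each run prints, for every layer definition, its name followed by the raw per-iteration inference times in nanoseconds (one value per loop iteration; 200 by default, or the value given as an optional second argument, e.g. `./benchncnn 7 500`). A minimal post-processing sketch, run from the host; the `awk` summary is illustrative and not part of the repository:

```bash
# Run the 7x7 experiments on the device and report the average time per layer in ms.
adb shell "cd /data/local/tmp/kill-the-bits && ./benchncnn 7" | awk '
/conv3x3/ {
    sum = 0
    for (i = 2; i <= NF; i++) sum += $i
    printf "%s avg = %.3f ms\n", $1, sum / (NF - 1) / 1e6
}'
```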

![](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)
# ncnn

[![License](https://img.shields.io/badge/license-BSD--3--Clause-blue.svg)](https://raw.githubusercontent.com/Tencent/ncnn/master/LICENSE.txt)
[![Build Status](https://travis-ci.org/Tencent/ncnn.svg?branch=master)](https://travis-ci.org/Tencent/ncnn)
[![Coverage Status](https://coveralls.io/repos/github/Tencent/ncnn/badge.svg?branch=master)](https://coveralls.io/github/Tencent/ncnn?branch=master)

@@ -155,4 +196,3 @@ ncnn is a high-performance neural network inference framework optimized to the extreme for mobile platforms
### License

BSD 3 Clause

179 changes: 42 additions & 137 deletions benchmark/benchncnn.cpp
@@ -48,7 +48,7 @@ class DataReaderFromEmpty : public ncnn::DataReader
virtual int read(void* buf, int size) const { memset(buf, 0, size); return size; }
};

static int g_warmup_loop_count = 8;
static int g_warmup_loop_count = 800; // BISONAI
static int g_loop_count = 4;

static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
@@ -98,7 +98,7 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op
#ifdef _WIN32
Sleep(10 * 1000);
#else
sleep(10);
// sleep(10); // BISONAI
#endif

ncnn::Mat out;
@@ -111,56 +111,52 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op
ex.extract("output", out);
}

double time_min = DBL_MAX;
double time_max = -DBL_MAX;
double time_avg = 0;
std::vector<double> times;

for (int i=0; i<g_loop_count; i++)
{
double start = ncnn::get_current_time();
auto start = std::chrono::high_resolution_clock::now();

{
ncnn::Extractor ex = net.create_extractor();
ex.input("data", in);
ex.extract("output", out);
}

double end = ncnn::get_current_time();
auto end = std::chrono::high_resolution_clock::now();

double time = end - start;
std::chrono::duration<double> time = end-start;

time_min = std::min(time_min, time);
time_max = std::max(time_max, time);
time_avg += time;
times.push_back(double(std::chrono::duration_cast<std::chrono::nanoseconds>(end-start).count()));
}

time_avg /= g_loop_count;

fprintf(stderr, "%20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg);
printf("%20s ", comment);
for(const auto & t : times)
printf("%f ", t);
printf("\n");
}

int main(int argc, char** argv)
{
int loop_count = 4;
int num_threads = ncnn::get_cpu_count();
int loop_count = 200;
int num_threads = 1;
int powersave = 0;
int gpu_device = -1;
int experiment_type = 7;

if (argc >= 2)
{
loop_count = atoi(argv[1]);
experiment_type = atoi(argv[1]);
if (experiment_type != 7 && experiment_type != 14 && experiment_type != 28)
{
printf("The only available experiments are for 7x7, 14x14, or 28x28 input sizes.\n");
printf("Please select one of those: 7, 14, or 28.");
exit(1);
}
}
if (argc >= 3)
{
num_threads = atoi(argv[2]);
}
if (argc >= 4)
{
powersave = atoi(argv[3]);
}
if (argc >= 5)
{
gpu_device = atoi(argv[4]);
loop_count = atoi(argv[2]);
}

bool use_vulkan_compute = gpu_device != -1;
@@ -202,7 +198,8 @@ int main(int argc, char** argv)
opt.use_fp16_arithmetic = true;
opt.use_int8_storage = true;
opt.use_int8_arithmetic = true;
opt.use_packing_layout = true;
// BISONAI: Convolution using packing on arm64 seems to be significantly slower.
opt.use_packing_layout = false;

ncnn::set_cpu_powersave(powersave);

@@ -214,122 +211,30 @@ int main(int argc, char** argv)
fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());
fprintf(stderr, "gpu_device = %d\n", gpu_device);

// run
benchmark("squeezenet", ncnn::Mat(227, 227, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
{
opt.use_packing_layout = false;
benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt);
opt.use_packing_layout = true;
}

benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
{
opt.use_packing_layout = false;
benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt);
opt.use_packing_layout = true;
}

benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), opt);

// #if NCNN_VULKAN
// if (!use_vulkan_compute)
// #endif // NCNN_VULKAN
// benchmark("mobilenet_v2_int8", ncnn::Mat(224, 224, 3), opt);

benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3), opt);

benchmark("shufflenet", ncnn::Mat(224, 224, 3), opt);

benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3), opt);

benchmark("mnasnet", ncnn::Mat(224, 224, 3), opt);

benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3), opt);

benchmark("googlenet", ncnn::Mat(224, 224, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
{
opt.use_packing_layout = false;
benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt);
opt.use_packing_layout = true;
}

benchmark("resnet18", ncnn::Mat(224, 224, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
if (experiment_type == 7)
{
opt.use_packing_layout = false;
benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt);
opt.use_packing_layout = true;
// 7x7
benchmark("conv3x3/conv2x32x3x3_2x32x7x7", ncnn::Mat(7, 7, 512), opt);
benchmark("conv3x3/conv2x64x3x3_2x64x7x7", ncnn::Mat(7, 7, 512), opt);
benchmark("conv3x3/conv2x128x3x3_2x128x7x7", ncnn::Mat(7, 7, 512), opt);
benchmark("conv3x3/conv2x256x3x3_2x256x7x7", ncnn::Mat(7, 7, 512), opt);
}

benchmark("alexnet", ncnn::Mat(227, 227, 3), opt);

benchmark("vgg16", ncnn::Mat(224, 224, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
else if (experiment_type == 14)
{
opt.use_packing_layout = false;
benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt);
opt.use_packing_layout = true;
// 14x14
benchmark("conv3x3/conv2x16x3x3_2x16x14x14", ncnn::Mat(14, 14, 256), opt);
benchmark("conv3x3/conv2x32x3x3_2x32x14x14", ncnn::Mat(14, 14, 256), opt);
benchmark("conv3x3/conv2x64x3x3_2x64x14x14", ncnn::Mat(14, 14, 256), opt);
benchmark("conv3x3/conv2x128x3x3_2x128x14x14", ncnn::Mat(14, 14, 256), opt);
}

benchmark("resnet50", ncnn::Mat(224, 224, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
else if (experiment_type == 28)
{
opt.use_packing_layout = false;
benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt);
opt.use_packing_layout = true;
// 28x28
benchmark("conv3x3/conv2x8x3x3_2x8x28x28", ncnn::Mat(28, 28, 128), opt);
benchmark("conv3x3/conv2x16x3x3_2x16x28x28", ncnn::Mat(28, 28, 128), opt);
benchmark("conv3x3/conv2x32x3x3_2x32x28x28", ncnn::Mat(28, 28, 128), opt);
benchmark("conv3x3/conv2x64x3x3_2x64x28x28", ncnn::Mat(28, 28, 128), opt);
}

benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
{
opt.use_packing_layout = false;
benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
opt.use_packing_layout = true;
}

benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt);

#if NCNN_VULKAN
if (!use_vulkan_compute)
#endif // NCNN_VULKAN
{
opt.use_packing_layout = false;
benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
opt.use_packing_layout = true;
}

benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt);

benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3), opt);

#if NCNN_VULKAN
delete g_blob_vkallocator;
delete g_staging_vkallocator;
#endif // NCNN_VULKAN

return 0;
}
14 changes: 14 additions & 0 deletions build_kill_the_bits_aarch64.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

##### android aarch64
mkdir -p build-android-aarch64
pushd build-android-aarch64
cmake \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-21 \
-DBISONAI_KILL_THE_BITS=ON \
..
make -j4
make install
popd
15 changes: 15 additions & 0 deletions build_kill_the_bits_armv7.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash

##### android armv7
mkdir -p build-android-armv7
pushd build-android-armv7
cmake \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI="armeabi-v7a" \
-DANDROID_ARM_NEON=ON \
-DANDROID_PLATFORM=android-19 \
-DBISONAI_KILL_THE_BITS=ON \
..
make -j4
make install
popd
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x128x3x3_2x128x14x14.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=14 1=14 2=256
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=4608 19=256 20=128 21=14
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x128x3x3_2x128x7x7.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=7 1=7 2=512
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=9216 19=512 20=128 21=7
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x16x3x3_2x16x14x14.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=14 1=14 2=256
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=4608 19=256 20=16 21=14
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x16x3x3_2x16x28x28.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=28 1=28 2=128
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=2304 19=128 20=16 21=28
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x256x3x3_2x256x7x7.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=7 1=7 2=512
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=9216 19=512 20=256 21=7
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x32x3x3_2x32x14x14.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=14 1=14 2=256
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=4608 19=256 20=32 21=14
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x32x3x3_2x32x28x28.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=28 1=28 2=128
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=2304 19=128 20=32 21=28
4 changes: 4 additions & 0 deletions experiments/conv3x3/conv2x32x3x3_2x32x7x7.param
@@ -0,0 +1,4 @@
7767517
2 2
Input data 0 1 data 0=7 1=7 2=512
Convolution conv 1 1 data output 0=2 1=3 11=3 2=1 12=1 3=1 13=1 4=0 5=1 6=9216 19=512 20=32 21=7