Androidonnxfork
commited on
Commit
•
8b7c501
1
Parent(s):
842b645
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .bazelrc +57 -0
- .clang-format +34 -0
- .gitattributes +2 -0
- .github/workflows/build.yml +207 -0
- .gitignore +34 -14
- BUILD.bazel +0 -0
- CMakeLists.txt +0 -0
- CONTRIBUTING.md +28 -0
- LICENSE +31 -0
- README.md +130 -12
- WORKSPACE +89 -0
- bench/abs.cc +277 -0
- bench/average-pooling.cc +429 -0
- bench/bankers-rounding.cc +277 -0
- bench/batch-matrix-multiply.cc +259 -0
- bench/bf16-gemm.cc +244 -0
- bench/bgemm.h +70 -0
- bench/ceiling.cc +277 -0
- bench/channel-shuffle.cc +340 -0
- bench/conv.h +852 -0
- bench/convert.cc +1339 -0
- bench/convolution.cc +1768 -0
- bench/cs16-bfly4.cc +116 -0
- bench/cs16-fftr.cc +73 -0
- bench/cs16-vsquareabs.cc +127 -0
- bench/dconv.h +54 -0
- bench/deconvolution.cc +575 -0
- bench/dwconv.h +368 -0
- bench/elu.cc +460 -0
- bench/end2end.cc +201 -0
- bench/end2end.h +37 -0
- bench/f16-conv-hwc2chw.cc +130 -0
- bench/f16-dwconv-e2e.cc +736 -0
- bench/f16-dwconv.cc +795 -0
- bench/f16-dwconv2d-chw.cc +496 -0
- bench/f16-f32-vcvt.cc +414 -0
- bench/f16-f32acc-gemm.cc +162 -0
- bench/f16-f32acc-igemm.cc +214 -0
- bench/f16-f32acc-rsum.cc +140 -0
- bench/f16-gavgpool-cw.cc +77 -0
- bench/f16-gemm-e2e.cc +452 -0
- bench/f16-gemm.cc +513 -0
- bench/f16-igemm.cc +588 -0
- bench/f16-raddstoreexpminusmax.cc +387 -0
- bench/f16-rsum.cc +101 -0
- bench/f16-spmm.cc +247 -0
- bench/f16-velu.cc +104 -0
- bench/f16-vsigmoid.cc +319 -0
- bench/f16-vsqrt.cc +121 -0
- bench/f16-vtanh.cc +807 -0
.bazelrc
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Basic build settings
|
2 |
+
build --jobs 128
|
3 |
+
build --cxxopt='-std=gnu++14'
|
4 |
+
|
5 |
+
# Sets the default Apple platform to macOS.
|
6 |
+
build --apple_platform_type=macos
|
7 |
+
|
8 |
+
# Android configs.
|
9 |
+
build:android --crosstool_top=//external:android/crosstool
|
10 |
+
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
|
11 |
+
build:android --linkopt=-ldl
|
12 |
+
build:android --linkopt=-Wl,--gc-sections
|
13 |
+
|
14 |
+
build:android_arm --config=android
|
15 |
+
build:android_arm --cpu=armeabi-v7a
|
16 |
+
build:android_arm --fat_apk_cpu=armeabi-v7a
|
17 |
+
|
18 |
+
build:android_arm64 --config=android
|
19 |
+
build:android_arm64 --cpu=arm64-v8a
|
20 |
+
build:android_arm64 --fat_apk_cpu=arm64-v8a
|
21 |
+
|
22 |
+
# iOS configs.
|
23 |
+
build:ios --apple_platform_type=ios
|
24 |
+
|
25 |
+
build:ios_i386 --config=ios
|
26 |
+
build:ios_i386 --cpu=ios_i386
|
27 |
+
build:ios_i386 --watchos_cpus=i386
|
28 |
+
|
29 |
+
build:ios_x86_64 --config=ios
|
30 |
+
build:ios_x86_64 --cpu=ios_x86_64
|
31 |
+
build:ios_x86_64 --watchos_cpus=i386
|
32 |
+
|
33 |
+
build:ios_armv7 --config=ios
|
34 |
+
build:ios_armv7 --cpu=ios_armv7
|
35 |
+
build:ios_armv7 --watchos_cpus=armv7k
|
36 |
+
|
37 |
+
build:ios_arm64 --config=ios
|
38 |
+
build:ios_arm64 --cpu=ios_arm64
|
39 |
+
build:ios_arm64 --watchos_cpus=armv7k
|
40 |
+
|
41 |
+
build:ios_arm64e --config=ios
|
42 |
+
build:ios_arm64e --cpu=ios_arm64e
|
43 |
+
build:ios_arm64e --watchos_cpus=armv7k
|
44 |
+
|
45 |
+
build:ios_sim_arm64 --config=ios
|
46 |
+
build:ios_sim_arm64 --cpu=ios_sim_arm64
|
47 |
+
build:ios_sim_arm64 --watchos_cpus=armv7k
|
48 |
+
|
49 |
+
build:ios_fat --config=ios
|
50 |
+
build:ios_fat --ios_multi_cpus=armv7,arm64
|
51 |
+
build:ios_fat --watchos_cpus=armv7k
|
52 |
+
|
53 |
+
# macOS configs.
|
54 |
+
build:macos --apple_platform_type=macos
|
55 |
+
|
56 |
+
build:macos_arm64 --config=macos
|
57 |
+
build:macos_arm64 --cpu=darwin_arm64
|
.clang-format
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AllowShortFunctionsOnASingleLine: Inline
|
2 |
+
PackConstructorInitializers: Never
|
3 |
+
ColumnLimit: 120
|
4 |
+
AlignAfterOpenBracket: AlwaysBreak
|
5 |
+
BinPackParameters: false
|
6 |
+
AllowAllParametersOfDeclarationOnNextLine: true
|
7 |
+
BreakBeforeBraces: Stroustrup
|
8 |
+
SpaceAfterCStyleCast: true
|
9 |
+
PointerAlignment: Left
|
10 |
+
ForEachMacros: ['XNN_UNPREDICTABLE', 'XNN_LIKELY', 'XNN_UNLIKELY']
|
11 |
+
IfMacros: ['IF']
|
12 |
+
IndentCaseLabels: true
|
13 |
+
ContinuationIndentWidth: 2
|
14 |
+
SpaceBeforeParens: Custom
|
15 |
+
SpaceBeforeParensOptions:
|
16 |
+
AfterControlStatements: true
|
17 |
+
AfterIfMacros: true
|
18 |
+
AfterForeachMacros: false
|
19 |
+
SpacesBeforeTrailingComments: 2
|
20 |
+
IncludeBlocks: Regroup
|
21 |
+
IncludeCategories:
|
22 |
+
- Regex: '<xnnpack[./][[:alnum:].-]+>' # match XNNPack includes first
|
23 |
+
Priority: 5
|
24 |
+
- Regex: 'benchmark.h' # includes used in benchmarks
|
25 |
+
Priority: 3
|
26 |
+
- Regex: 'bench/' # includes used in benchmarks
|
27 |
+
Priority: 3
|
28 |
+
- Regex: 'gtest.h' # includes used in tests
|
29 |
+
Priority: 3
|
30 |
+
- Regex: 'gmock.h' # includes used in tests
|
31 |
+
Priority: 3
|
32 |
+
- Regex: '<[[:alnum:].]+>' # system headers
|
33 |
+
Priority: 2 # lower priority to keep it sorted first before XNNPack includes
|
34 |
+
MaxEmptyLinesToKeep: 2 # used to separate includes from functions
|
.gitattributes
CHANGED
@@ -121,3 +121,5 @@ fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/m
|
|
121 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_decoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
|
122 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.ort filter=lfs diff=lfs merge=lfs -text
|
123 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
121 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_decoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
|
122 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.ort filter=lfs diff=lfs merge=lfs -text
|
123 |
fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
|
124 |
+
build/CMakeFiles/microkernels-all.dir/build.make filter=lfs diff=lfs merge=lfs -text
|
125 |
+
build/libXNNPACK.a filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/build.yml
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Build using CMake
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
paths:
|
5 |
+
- '**.S'
|
6 |
+
- '**.c'
|
7 |
+
- '**.cc'
|
8 |
+
- '**.h'
|
9 |
+
- 'CMakeLists.txt'
|
10 |
+
- 'cmake/**'
|
11 |
+
- 'scripts/build-*.sh'
|
12 |
+
- '.github/**/*.yml'
|
13 |
+
concurrency:
|
14 |
+
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
|
15 |
+
cancel-in-progress: true
|
16 |
+
jobs:
|
17 |
+
cmake-linux-local:
|
18 |
+
runs-on: ubuntu-latest
|
19 |
+
timeout-minutes: 60
|
20 |
+
steps:
|
21 |
+
- uses: actions/checkout@v3
|
22 |
+
- name: Update apt
|
23 |
+
run: sudo apt update
|
24 |
+
- name: Install ninja
|
25 |
+
run: sudo apt install ninja-build
|
26 |
+
- name: Configure and build
|
27 |
+
run: scripts/build-local.sh
|
28 |
+
working-directory: ${{ github.workspace }}
|
29 |
+
cmake-linux-aarch64:
|
30 |
+
runs-on: ubuntu-22.04
|
31 |
+
timeout-minutes: 120
|
32 |
+
steps:
|
33 |
+
- uses: actions/checkout@v3
|
34 |
+
- name: Update apt
|
35 |
+
run: sudo apt update
|
36 |
+
- name: Install ninja
|
37 |
+
run: sudo apt install ninja-build
|
38 |
+
- name: Install aarch64 cross-toolchain
|
39 |
+
run: sudo apt install crossbuild-essential-arm64
|
40 |
+
- name: Install qemu-aarch64
|
41 |
+
run: sudo apt install qemu-user
|
42 |
+
- name: Configure and build
|
43 |
+
run: scripts/build-linux-aarch64.sh -DCMAKE_BUILD_TYPE=Release
|
44 |
+
working-directory: ${{ github.workspace }}
|
45 |
+
- name: Run tests
|
46 |
+
run: ctest --output-on-failure --parallel $(nproc)
|
47 |
+
working-directory: ${{ github.workspace }}/build/linux/aarch64
|
48 |
+
cmake-linux-armhf:
|
49 |
+
runs-on: ubuntu-22.04
|
50 |
+
timeout-minutes: 90
|
51 |
+
steps:
|
52 |
+
- uses: actions/checkout@v3
|
53 |
+
- name: Update apt
|
54 |
+
run: sudo apt update
|
55 |
+
- name: Install ninja
|
56 |
+
run: sudo apt install ninja-build
|
57 |
+
- name: Install armhf cross-toolchain
|
58 |
+
run: sudo apt install crossbuild-essential-armhf
|
59 |
+
- name: Install qemu-arm
|
60 |
+
run: sudo apt install qemu-user
|
61 |
+
- name: Configure and build
|
62 |
+
run: scripts/build-linux-armhf.sh -DCMAKE_BUILD_TYPE=Release
|
63 |
+
working-directory: ${{ github.workspace }}
|
64 |
+
- name: Run tests
|
65 |
+
run: ctest --output-on-failure --parallel $(nproc)
|
66 |
+
working-directory: ${{ github.workspace }}/build/linux/armhf
|
67 |
+
cmake-linux-riscv64:
|
68 |
+
runs-on: ubuntu-22.04
|
69 |
+
timeout-minutes: 60
|
70 |
+
steps:
|
71 |
+
- uses: actions/checkout@v3
|
72 |
+
- name: Update apt
|
73 |
+
run: sudo apt update
|
74 |
+
- name: Install ninja
|
75 |
+
run: sudo apt install ninja-build
|
76 |
+
- name: Install riscv64 cross-toolchain
|
77 |
+
run: sudo apt install crossbuild-essential-riscv64
|
78 |
+
- name: Install qemu-riscv64
|
79 |
+
run: sudo apt install qemu-user
|
80 |
+
- name: Configure and build
|
81 |
+
run: scripts/build-linux-riscv64.sh -DCMAKE_BUILD_TYPE=Release -DXNNPACK_ENABLE_RISCV_VECTOR=OFF
|
82 |
+
working-directory: ${{ github.workspace }}
|
83 |
+
- name: Run tests
|
84 |
+
run: ctest --output-on-failure --parallel $(nproc)
|
85 |
+
working-directory: ${{ github.workspace }}/build/linux/riscv64
|
86 |
+
cmake-windows-arm64:
|
87 |
+
runs-on: windows-latest
|
88 |
+
timeout-minutes: 120
|
89 |
+
steps:
|
90 |
+
- uses: actions/checkout@v3
|
91 |
+
- name: Configure and build
|
92 |
+
run: scripts/build-windows-arm64.cmd
|
93 |
+
shell: cmd
|
94 |
+
working-directory: ${{ github.workspace }}
|
95 |
+
cmake-windows-x64:
|
96 |
+
runs-on: windows-latest
|
97 |
+
timeout-minutes: 120
|
98 |
+
steps:
|
99 |
+
- uses: actions/checkout@v3
|
100 |
+
- name: Configure and build
|
101 |
+
run: scripts/build-windows-x64.cmd
|
102 |
+
shell: cmd
|
103 |
+
working-directory: ${{ github.workspace }}
|
104 |
+
env:
|
105 |
+
CFLAGS: "/UNDEBUG"
|
106 |
+
CXXFLAGS: "/UNDEBUG"
|
107 |
+
- name: Run tests
|
108 |
+
run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
|
109 |
+
working-directory: ${{ github.workspace }}/build/windows/x64
|
110 |
+
cmake-windows-x86:
|
111 |
+
runs-on: windows-latest
|
112 |
+
timeout-minutes: 120
|
113 |
+
steps:
|
114 |
+
- uses: actions/checkout@v3
|
115 |
+
- name: Configure and build
|
116 |
+
run: scripts/build-windows-x86.cmd
|
117 |
+
shell: cmd
|
118 |
+
working-directory: ${{ github.workspace }}
|
119 |
+
env:
|
120 |
+
CFLAGS: "/UNDEBUG"
|
121 |
+
CXXFLAGS: "/UNDEBUG"
|
122 |
+
- name: Run tests
|
123 |
+
run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
|
124 |
+
working-directory: ${{ github.workspace }}/build/windows/x86
|
125 |
+
cmake-macos-arm64:
|
126 |
+
runs-on: macos-latest
|
127 |
+
timeout-minutes: 60
|
128 |
+
steps:
|
129 |
+
- uses: actions/checkout@v3
|
130 |
+
- name: Create output directory
|
131 |
+
run: mkdir -p build/macos/arm64
|
132 |
+
working-directory: ${{ github.workspace }}
|
133 |
+
- name: Generate CMake project
|
134 |
+
run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=arm64 -DHAVE_STD_REGEX=TRUE ../../..
|
135 |
+
working-directory: ${{ github.workspace }}/build/macos/arm64
|
136 |
+
- name: Build with Xcode
|
137 |
+
run: cmake --build build/macos/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
|
138 |
+
working-directory: ${{ github.workspace }}
|
139 |
+
cmake-macos-x86_64:
|
140 |
+
runs-on: macos-latest
|
141 |
+
timeout-minutes: 90
|
142 |
+
steps:
|
143 |
+
- uses: actions/checkout@v3
|
144 |
+
- name: Create output directory
|
145 |
+
run: mkdir -p build/macos/x86_64
|
146 |
+
working-directory: ${{ github.workspace }}
|
147 |
+
- name: Generate CMake project
|
148 |
+
run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=x86_64 -DHAVE_STD_REGEX=TRUE ../../..
|
149 |
+
working-directory: ${{ github.workspace }}/build/macos/x86_64
|
150 |
+
- name: Build with Xcode
|
151 |
+
run: cmake --build build/macos/x86_64 --parallel $(sysctl -n hw.ncpu) -- -quiet
|
152 |
+
working-directory: ${{ github.workspace }}
|
153 |
+
- name: Run tests
|
154 |
+
run: ctest --build-config Debug --output-on-failure --parallel $(sysctl -n hw.ncpu)
|
155 |
+
working-directory: ${{ github.workspace }}/build/macos/x86_64
|
156 |
+
cmake-android:
|
157 |
+
strategy:
|
158 |
+
matrix:
|
159 |
+
script: [build-android-arm64.sh, build-android-armv7.sh, build-android-x86.sh]
|
160 |
+
runs-on: ubuntu-latest
|
161 |
+
timeout-minutes: 40
|
162 |
+
steps:
|
163 |
+
- uses: actions/checkout@v3
|
164 |
+
- name: Update apt
|
165 |
+
run: sudo apt update
|
166 |
+
- name: Install ninja
|
167 |
+
run: sudo apt install ninja-build
|
168 |
+
- name: Setup Android NDK
|
169 |
+
id: setup-ndk
|
170 |
+
uses: nttld/setup-ndk@v1
|
171 |
+
with:
|
172 |
+
ndk-version: r23b
|
173 |
+
add-to-path: false
|
174 |
+
- name: Configure and build
|
175 |
+
run: scripts/${{ matrix.script }}
|
176 |
+
working-directory: ${{ github.workspace }}
|
177 |
+
env:
|
178 |
+
ANDROID_NDK: ${{ steps.setup-ndk.outputs.ndk-path }}
|
179 |
+
cmake-ios-arm64:
|
180 |
+
runs-on: macos-latest
|
181 |
+
timeout-minutes: 60
|
182 |
+
steps:
|
183 |
+
- uses: actions/checkout@v3
|
184 |
+
- name: Create output directory
|
185 |
+
run: mkdir -p build/ios/arm64
|
186 |
+
working-directory: ${{ github.workspace }}
|
187 |
+
- name: Generate CMake project
|
188 |
+
run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=arm64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
|
189 |
+
working-directory: ${{ github.workspace }}/build/ios/arm64
|
190 |
+
- name: Build with Xcode
|
191 |
+
run: cmake --build build/ios/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
|
192 |
+
working-directory: ${{ github.workspace }}
|
193 |
+
cmake-ios-x86_64:
|
194 |
+
runs-on: macos-latest
|
195 |
+
timeout-minutes: 60
|
196 |
+
steps:
|
197 |
+
- uses: actions/checkout@v3
|
198 |
+
- name: Create output directory
|
199 |
+
run: mkdir -p build/ios/x86_64
|
200 |
+
working-directory: ${{ github.workspace }}
|
201 |
+
- name: Generate CMake project
|
202 |
+
run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=x86_64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
|
203 |
+
working-directory: ${{ github.workspace }}/build/ios/x86_64
|
204 |
+
- name: Build with Xcode
|
205 |
+
run: cmake --build build/ios/x86_64 --parallel $(sysctl -n hw.ncpu) -- -sdk iphonesimulator -quiet
|
206 |
+
working-directory: ${{ github.workspace }}
|
207 |
+
|
.gitignore
CHANGED
@@ -1,15 +1,35 @@
|
|
1 |
-
|
2 |
-
.
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
.DS_Store
|
11 |
-
|
12 |
-
|
13 |
-
.
|
14 |
-
.
|
15 |
-
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# Copyright 2019 Google LLC
|
5 |
+
#
|
6 |
+
# This source code is licensed under the BSD-style license found in the
|
7 |
+
# LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
# Build objects and artifacts
|
10 |
+
bazel-bin
|
11 |
+
bazel-genfiles
|
12 |
+
bazel-out
|
13 |
+
bazel-testlogs
|
14 |
+
bazel-XNNPACK
|
15 |
+
bin/
|
16 |
+
build/
|
17 |
+
build-*/
|
18 |
+
deps/
|
19 |
+
lib/
|
20 |
+
libs/
|
21 |
+
obj/
|
22 |
+
out/
|
23 |
+
*.pyc
|
24 |
+
*.pyo
|
25 |
+
*.log
|
26 |
+
|
27 |
+
# System files
|
28 |
.DS_Store
|
29 |
+
.DS_Store?
|
30 |
+
._*
|
31 |
+
.Spotlight-V100
|
32 |
+
.Trashes
|
33 |
+
ehthumbs.db
|
34 |
+
Thumbs.db
|
35 |
+
*.swp
|
BUILD.bazel
ADDED
The diff for this file is too large to render.
See raw diff
|
|
CMakeLists.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
CONTRIBUTING.md
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# How to Contribute
|
2 |
+
|
3 |
+
We'd love to accept your patches and contributions to this project. There are
|
4 |
+
just a few small guidelines you need to follow.
|
5 |
+
|
6 |
+
## Contributor License Agreement
|
7 |
+
|
8 |
+
Contributions to this project must be accompanied by a Contributor License
|
9 |
+
Agreement. You (or your employer) retain the copyright to your contribution;
|
10 |
+
this simply gives us permission to use and redistribute your contributions as
|
11 |
+
part of the project. Head over to <https://cla.developers.google.com/> to see
|
12 |
+
your current agreements on file or to sign a new one.
|
13 |
+
|
14 |
+
You generally only need to submit a CLA once, so if you've already submitted one
|
15 |
+
(even if it was for a different project), you probably don't need to do it
|
16 |
+
again.
|
17 |
+
|
18 |
+
## Code reviews
|
19 |
+
|
20 |
+
All submissions, including submissions by project members, require review. We
|
21 |
+
use GitHub pull requests for this purpose. Consult
|
22 |
+
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
23 |
+
information on using pull requests.
|
24 |
+
|
25 |
+
## Community Guidelines
|
26 |
+
|
27 |
+
This project follows [Google's Open Source Community
|
28 |
+
Guidelines](https://opensource.google.com/conduct/).
|
LICENSE
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BSD License
|
2 |
+
|
3 |
+
For XNNPACK software
|
4 |
+
|
5 |
+
Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
6 |
+
Copyright 2019 Google LLC
|
7 |
+
|
8 |
+
Redistribution and use in source and binary forms, with or without modification,
|
9 |
+
are permitted provided that the following conditions are met:
|
10 |
+
|
11 |
+
* Redistributions of source code must retain the above copyright notice, this
|
12 |
+
list of conditions and the following disclaimer.
|
13 |
+
|
14 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
15 |
+
this list of conditions and the following disclaimer in the documentation
|
16 |
+
and/or other materials provided with the distribution.
|
17 |
+
|
18 |
+
* Neither the name Facebook nor the names of its contributors may be used to
|
19 |
+
endorse or promote products derived from this software without specific
|
20 |
+
prior written permission.
|
21 |
+
|
22 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
23 |
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
24 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
26 |
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
27 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
28 |
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
29 |
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
30 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
31 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
README.md
CHANGED
@@ -1,12 +1,130 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# XNNPACK
|
2 |
+
|
3 |
+
XNNPACK is a highly optimized solution for neural network inference on ARM, x86, WebAssembly, and RISC-V platforms. XNNPACK is not intended for direct use by deep learning practitioners and researchers; instead it provides low-level performance primitives for accelerating high-level machine learning frameworks, such as [TensorFlow Lite](https://www.tensorflow.org/lite), [TensorFlow.js](https://www.tensorflow.org/js), [PyTorch](https://pytorch.org/), [ONNX Runtime](https://onnxruntime.ai), and [MediaPipe](https://mediapipe.dev).
|
4 |
+
|
5 |
+
## Supported Architectures
|
6 |
+
|
7 |
+
- ARM64 on Android, iOS, macOS, Linux, and Windows
|
8 |
+
- ARMv7 (with NEON) on Android
|
9 |
+
- ARMv6 (with VFPv2) on Linux
|
10 |
+
- x86 and x86-64 (up to AVX512) on Windows, Linux, macOS, Android, and iOS simulator
|
11 |
+
- WebAssembly MVP
|
12 |
+
- WebAssembly SIMD
|
13 |
+
- [WebAssembly Relaxed SIMD](https://github.com/WebAssembly/relaxed-simd) (experimental)
|
14 |
+
- RISC-V (RV32GC and RV64GC)
|
15 |
+
|
16 |
+
## Operator Coverage
|
17 |
+
|
18 |
+
XNNPACK implements the following neural network operators:
|
19 |
+
|
20 |
+
- 2D Convolution (including grouped and depthwise)
|
21 |
+
- 2D Deconvolution (AKA Transposed Convolution)
|
22 |
+
- 2D Average Pooling
|
23 |
+
- 2D Max Pooling
|
24 |
+
- 2D ArgMax Pooling (Max Pooling + indices)
|
25 |
+
- 2D Unpooling
|
26 |
+
- 2D Bilinear Resize
|
27 |
+
- 2D Depth-to-Space (AKA Pixel Shuffle)
|
28 |
+
- Add (including broadcasting, two inputs only)
|
29 |
+
- Subtract (including broadcasting)
|
30 |
+
- Divide (including broadcasting)
|
31 |
+
- Maximum (including broadcasting)
|
32 |
+
- Minimum (including broadcasting)
|
33 |
+
- Multiply (including broadcasting)
|
34 |
+
- Squared Difference (including broadcasting)
|
35 |
+
- Global Average Pooling
|
36 |
+
- Channel Shuffle
|
37 |
+
- Fully Connected
|
38 |
+
- Abs (absolute value)
|
39 |
+
- Bankers' Rounding (rounding to nearest, ties to even)
|
40 |
+
- Ceiling (rounding to integer above)
|
41 |
+
- Clamp (includes ReLU and ReLU6)
|
42 |
+
- Convert (includes fixed-point and half-precision quantization and
|
43 |
+
dequantization)
|
44 |
+
- Copy
|
45 |
+
- ELU
|
46 |
+
- Floor (rounding to integer below)
|
47 |
+
- HardSwish
|
48 |
+
- Leaky ReLU
|
49 |
+
- Negate
|
50 |
+
- Sigmoid
|
51 |
+
- Softmax
|
52 |
+
- Square
|
53 |
+
- Tanh
|
54 |
+
- Transpose
|
55 |
+
- Truncation (rounding to integer towards zero)
|
56 |
+
- PReLU
|
57 |
+
|
58 |
+
All operators in XNNPACK support NHWC layout, but additionally allow custom stride along the **C**hannel dimension. Thus, operators can consume a subset of channels in the input tensor, and produce a subset of channels in the output tensor, providing a zero-cost Channel Split and Channel Concatenation operations.
|
59 |
+
|
60 |
+
## Performance
|
61 |
+
|
62 |
+
### Mobile phones
|
63 |
+
|
64 |
+
The table below presents **single-threaded** performance of XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
|
65 |
+
|
66 |
+
| Model | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
|
67 |
+
| ----------------------- | :-------: | :---------: | :----------: |
|
68 |
+
| FP32 MobileNet v1 1.0X | 82 | 86 | 88 |
|
69 |
+
| FP32 MobileNet v2 1.0X | 49 | 53 | 55 |
|
70 |
+
| FP32 MobileNet v3 Large | 39 | 42 | 44 |
|
71 |
+
| FP32 MobileNet v3 Small | 12 | 14 | 14 |
|
72 |
+
|
73 |
+
The following table presents **multi-threaded** (using as many threads as there are big cores) performance of XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
|
74 |
+
|
75 |
+
| Model | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
|
76 |
+
| ----------------------- | :-------: | :---------: | :----------: |
|
77 |
+
| FP32 MobileNet v1 1.0X | 43 | 27 | 46 |
|
78 |
+
| FP32 MobileNet v2 1.0X | 26 | 18 | 28 |
|
79 |
+
| FP32 MobileNet v3 Large | 22 | 16 | 24 |
|
80 |
+
| FP32 MobileNet v3 Small | 7 | 6 | 8 |
|
81 |
+
|
82 |
+
Benchmarked on March 27, 2020 with `end2end_bench --benchmark_min_time=5` on an Android/ARM64 build with Android NDK r21 (`bazel build -c opt --config android_arm64 :end2end_bench`) and neural network models with randomized weights and inputs.
|
83 |
+
|
84 |
+
### Raspberry Pi
|
85 |
+
|
86 |
+
The table below presents **multi-threaded** performance of XNNPACK library on three generations of MobileNet models and three generations of Raspberry Pi boards.
|
87 |
+
|
88 |
+
| Model | RPi Zero W (BCM2835), ms | RPi 2 (BCM2836), ms | RPi 3+ (BCM2837B0), ms | RPi 4 (BCM2711), ms | RPi 4 (BCM2711, ARM64), ms |
|
89 |
+
| ----------------------- | :----------------------: | :-----------------: | :--------------------: | :-----------------: | :------------------------: |
|
90 |
+
| FP32 MobileNet v1 1.0X | 3919 | 302 | 114 | 72 | 77 |
|
91 |
+
| FP32 MobileNet v2 1.0X | 1987 | 191 | 79 | 41 | 46 |
|
92 |
+
| FP32 MobileNet v3 Large | 1658 | 161 | 67 | 38 | 40 |
|
93 |
+
| FP32 MobileNet v3 Small | 474 | 50 | 22 | 13 | 15 |
|
94 |
+
| INT8 MobileNet v1 1.0X | 2589 | 128 | 46 | 29 | 24 |
|
95 |
+
| INT8 MobileNet v2 1.0X | 1495 | 82 | 30 | 20 | 17 |
|
96 |
+
|
97 |
+
Benchmarked on Feb 8, 2022 with `end2end-bench --benchmark_min_time=5` on a Raspbian Buster build with CMake (`./scripts/build-local.sh`) and neural network models with randomized weights and inputs. INT8 inference was evaluated on per-channel quantization schema.
|
98 |
+
|
99 |
+
## Minimum build requirements
|
100 |
+
|
101 |
+
- C11
|
102 |
+
- C++14
|
103 |
+
- Python 3
|
104 |
+
|
105 |
+
## Publications
|
106 |
+
|
107 |
+
- Marat Dukhan "The Indirect Convolution Algorithm". Presented on [Efficient Deep Learning for Compute Vision (ECV) 2019](https://sites.google.com/corp/view/ecv2019/) workshop ([slides](https://drive.google.com/file/d/1ZayB3By5ZxxQIRtN7UDq_JvPg1IYd3Ac/view), [paper on ArXiv](https://arxiv.org/abs/1907.02129)).
|
108 |
+
- Erich Elsen, Marat Dukhan, Trevor Gale, Karen Simonyan "Fast Sparse ConvNets".
|
109 |
+
[Paper on ArXiv](https://arxiv.org/abs/1911.09723), [pre-trained sparse
|
110 |
+
models](https://github.com/google-research/google-research/tree/master/fastconvnets).
|
111 |
+
- Marat Dukhan, Artsiom Ablavatski "The Two-Pass Softmax Algorithm".
|
112 |
+
[Paper on ArXiv](https://arxiv.org/abs/2001.04438).
|
113 |
+
- Yury Pisarchyk, Juhyun Lee "Efficient Memory Management for Deep Neural Net Inference".
|
114 |
+
[Paper on ArXiv](https://arxiv.org/abs/2001.03288).
|
115 |
+
|
116 |
+
## Ecosystem
|
117 |
+
|
118 |
+
### Machine Learning Frameworks
|
119 |
+
|
120 |
+
- [TensorFlow Lite](https://blog.tensorflow.org/2020/07/accelerating-tensorflow-lite-xnnpack-integration.html).
|
121 |
+
- [TensorFlow.js WebAssembly backend](https://blog.tensorflow.org/2020/03/introducing-webassembly-backend-for-tensorflow-js.html).
|
122 |
+
- [PyTorch Mobile](https://pytorch.org/mobile).
|
123 |
+
- [ONNX Runtime Mobile](https://onnxruntime.ai/docs/execution-providers/Xnnpack-ExecutionProvider.html)
|
124 |
+
- [MediaPipe for the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html).
|
125 |
+
- [Alibaba HALO (Heterogeneity-Aware Lowering and Optimization)](https://github.com/alibaba/heterogeneity-aware-lowering-and-optimization)
|
126 |
+
- [Samsung ONE (On-device Neural Engine)](https://github.com/Samsung/ONE)
|
127 |
+
|
128 |
+
## Acknowledgements
|
129 |
+
|
130 |
+
XNNPACK is a based on [QNNPACK](https://github.com/pytorch/QNNPACK) library. Over time its codebase diverged a lot, and XNNPACK API is no longer compatible with QNNPACK.
|
WORKSPACE
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
workspace(name = "xnnpack")
|
2 |
+
|
3 |
+
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
4 |
+
|
5 |
+
# Bazel rule definitions
|
6 |
+
http_archive(
|
7 |
+
name = "rules_cc",
|
8 |
+
strip_prefix = "rules_cc-main",
|
9 |
+
urls = ["https://github.com/bazelbuild/rules_cc/archive/main.zip"],
|
10 |
+
)
|
11 |
+
|
12 |
+
# Bazel Skylib.
|
13 |
+
http_archive(
|
14 |
+
name = "bazel_skylib",
|
15 |
+
sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
|
16 |
+
urls = [
|
17 |
+
"https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
|
18 |
+
"https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
|
19 |
+
],
|
20 |
+
)
|
21 |
+
|
22 |
+
# Google Test framework, used by most unit-tests.
|
23 |
+
http_archive(
|
24 |
+
name = "com_google_googletest",
|
25 |
+
sha256 = "5cb522f1427558c6df572d6d0e1bf0fd076428633d080e88ad5312be0b6a8859",
|
26 |
+
strip_prefix = "googletest-e23cdb78e9fef1f69a9ef917f447add5638daf2a",
|
27 |
+
urls = ["https://github.com/google/googletest/archive/e23cdb78e9fef1f69a9ef917f447add5638daf2a.zip"],
|
28 |
+
)
|
29 |
+
|
30 |
+
# Google Benchmark library, used in micro-benchmarks.
|
31 |
+
http_archive(
|
32 |
+
name = "com_google_benchmark",
|
33 |
+
sha256 = "1ba14374fddcd9623f126b1a60945e4deac4cdc4fb25a5f25e7f779e36f2db52",
|
34 |
+
strip_prefix = "benchmark-d2a8a4ee41b923876c034afb939c4fc03598e622",
|
35 |
+
urls = ["https://github.com/google/benchmark/archive/d2a8a4ee41b923876c034afb939c4fc03598e622.zip"],
|
36 |
+
)
|
37 |
+
|
38 |
+
# FP16 library, used for half-precision conversions
|
39 |
+
http_archive(
|
40 |
+
name = "FP16",
|
41 |
+
build_file = "@//third_party:FP16.BUILD",
|
42 |
+
sha256 = "e66e65515fa09927b348d3d584c68be4215cfe664100d01c9dbc7655a5716d70",
|
43 |
+
strip_prefix = "FP16-0a92994d729ff76a58f692d3028ca1b64b145d91",
|
44 |
+
urls = [
|
45 |
+
"https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip",
|
46 |
+
],
|
47 |
+
)
|
48 |
+
|
49 |
+
# FXdiv library, used for repeated integer division by the same factor
|
50 |
+
http_archive(
|
51 |
+
name = "FXdiv",
|
52 |
+
sha256 = "ab7dfb08829bee33dca38405d647868fb214ac685e379ec7ef2bebcd234cd44d",
|
53 |
+
strip_prefix = "FXdiv-b408327ac2a15ec3e43352421954f5b1967701d1",
|
54 |
+
urls = ["https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip"],
|
55 |
+
)
|
56 |
+
|
57 |
+
# pthreadpool library, used for parallelization
|
58 |
+
http_archive(
|
59 |
+
name = "pthreadpool",
|
60 |
+
sha256 = "e6370550a1abf1503daf3c2c196e0a1c2b253440c39e1a57740ff49af2d8bedf",
|
61 |
+
strip_prefix = "pthreadpool-43edadc654d6283b4b6e45ba09a853181ae8e850",
|
62 |
+
urls = ["https://github.com/Maratyszcza/pthreadpool/archive/43edadc654d6283b4b6e45ba09a853181ae8e850.zip"],
|
63 |
+
)
|
64 |
+
|
65 |
+
# cpuinfo library, used for detecting processor characteristics
|
66 |
+
http_archive(
|
67 |
+
name = "cpuinfo",
|
68 |
+
sha256 = "609fc42c47482c1fc125dccac65e843f640e792540162581c4b7eb6ff81c826a",
|
69 |
+
strip_prefix = "cpuinfo-87d8234510367db49a65535021af5e1838a65ac2",
|
70 |
+
urls = [
|
71 |
+
"https://github.com/pytorch/cpuinfo/archive/87d8234510367db49a65535021af5e1838a65ac2.zip",
|
72 |
+
],
|
73 |
+
)
|
74 |
+
|
75 |
+
# Ruy library, used to benchmark against
|
76 |
+
http_archive(
|
77 |
+
name = "ruy",
|
78 |
+
sha256 = "fe8345f521bb378745ebdd0f8c5937414849936851d2ec2609774eb2d7098e54",
|
79 |
+
strip_prefix = "ruy-9f53ba413e6fc879236dcaa3e008915973d67a4f",
|
80 |
+
urls = [
|
81 |
+
"https://github.com/google/ruy/archive/9f53ba413e6fc879236dcaa3e008915973d67a4f.zip",
|
82 |
+
],
|
83 |
+
)
|
84 |
+
|
85 |
+
# Android NDK location and version is auto-detected from $ANDROID_NDK_HOME environment variable
|
86 |
+
android_ndk_repository(name = "androidndk")
|
87 |
+
|
88 |
+
# Android SDK location and API is auto-detected from $ANDROID_HOME environment variable
|
89 |
+
android_sdk_repository(name = "androidsdk")
|
bench/abs.cc
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2021 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <limits>
|
11 |
+
#include <memory>
|
12 |
+
#include <random>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#include <fp16/fp16.h>
|
16 |
+
|
17 |
+
#include <xnnpack.h>
|
18 |
+
|
19 |
+
#include <benchmark/benchmark.h>
|
20 |
+
#include "bench/utils.h"
|
21 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
22 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
23 |
+
#include "tensorflow/lite/interpreter.h"
|
24 |
+
#include "tensorflow/lite/kernels/register.h"
|
25 |
+
#include "tensorflow/lite/model.h"
|
26 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
27 |
+
#include "tensorflow/lite/version.h"
|
28 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
29 |
+
|
30 |
+
|
31 |
+
// Benchmarks the XNNPACK half-precision (f16) Abs operator on a contiguous
// 1D batch (channels = 1). The batch size comes from state.range(0).
static void xnnpack_abs_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
  // f16 inputs are produced by converting random f32 values.
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // XNN_EXTRA_BYTES of slack lets vectorized kernels read past the last element.
  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<uint16_t> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Pre-fill the output with f16 NaN so stale/unwritten results are detectable.
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t abs_op = nullptr;
  status = xnn_create_abs_nc_f16(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &abs_op);
  if (status != xnn_status_success || abs_op == nullptr) {
    state.SkipWithError("failed to create Abs operator");
    return;
  }

  status = xnn_reshape_abs_nc_f16(abs_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Abs operator");
    xnn_delete_operator(abs_op);  // don't leak the operator on early exit
    return;
  }

  status = xnn_setup_abs_nc_f16(abs_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Abs operator");
    xnn_delete_operator(abs_op);  // don't leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(abs_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Abs operator");
      xnn_delete_operator(abs_op);  // don't leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(abs_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Abs operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Elements processed per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Bytes moved per iteration: one read plus one write per element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
97 |
+
|
98 |
+
// Benchmarks the XNNPACK single-precision (f32) Abs operator on a contiguous
// 1D batch (channels = 1). The batch size comes from state.range(0).
static void xnnpack_abs_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  // XNN_EXTRA_BYTES of slack lets vectorized kernels read past the last element.
  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::vector<float> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  // Pre-fill the output with NaN so stale/unwritten results are detectable.
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t abs_op = nullptr;
  status = xnn_create_abs_nc_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &abs_op);
  if (status != xnn_status_success || abs_op == nullptr) {
    state.SkipWithError("failed to create Abs operator");
    return;
  }

  status = xnn_reshape_abs_nc_f32(abs_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Abs operator");
    xnn_delete_operator(abs_op);  // don't leak the operator on early exit
    return;
  }

  status = xnn_setup_abs_nc_f32(abs_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Abs operator");
    xnn_delete_operator(abs_op);  // don't leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(abs_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Abs operator");
      xnn_delete_operator(abs_op);  // don't leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(abs_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Abs operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Elements processed per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Bytes moved per iteration: one read plus one write per element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
163 |
+
|
164 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
165 |
+
// Reference benchmark: builds a one-operator (ABS) TensorFlow Lite model
// in memory via FlatBuffers and times single-threaded interpreter invocation.
// Serves as a baseline to compare against the XNNPACK Abs operator above.
static void tflite_abs_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  // Assemble the flatbuffer model: opcode table, buffers, tensors, operator,
  // subgraph, model — in the order the TFLite schema expects.
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_ABS);

  // Buffer 0 is the mandatory empty sentinel buffer.
  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  // Both input and output are flat 1-D tensors of batch_size elements.
  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
  }};

  // Tensor 0 is the operator's input, tensor 1 its output.
  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
    builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    builder.CreateString("Abs model"),
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates so TFLite's own ABS kernel is timed,
  // not a delegate's implementation.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  // Single-threaded run for a like-for-like comparison with the XNNPACK path.
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the interpreter-owned input tensor (index 0) with random values.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Elements processed per second.
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Bytes moved per iteration: one read plus one write per element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
260 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
261 |
+
|
262 |
+
// Register the XNNPACK Abs benchmarks over the standard unary-elementwise
// batch-size sweep; wall-clock (real) time is reported.
BENCHMARK(xnnpack_abs_f16)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK(xnnpack_abs_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();

// TensorFlow Lite baseline over the same batch sizes.
#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK(tflite_abs_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

// Provide main() unless the embedder links its own benchmark driver.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/average-pooling.cc
ADDED
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cfloat>
|
11 |
+
#include <cmath>
|
12 |
+
#include <functional>
|
13 |
+
#include <limits>
|
14 |
+
#include <memory>
|
15 |
+
#include <random>
|
16 |
+
#include <vector>
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
|
20 |
+
#include <benchmark/benchmark.h>
|
21 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
22 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
23 |
+
#include "tensorflow/lite/interpreter.h"
|
24 |
+
#include "tensorflow/lite/kernels/register.h"
|
25 |
+
#include "tensorflow/lite/model.h"
|
26 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
27 |
+
#include "tensorflow/lite/version.h"
|
28 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
29 |
+
#include "bench/utils.h"
|
30 |
+
|
31 |
+
// Benchmarks XNNPACK quantized (qu8) 2D average pooling in NHWC layout.
// Benchmark ranges: 0=N, 1=H, 2=W, 3=pooling size K, 4=padding P,
// 5=stride S, 6=channels C. `net` labels the model family for reporting.
static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  // Standard padded-pooling output size formula.
  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  // XNN_EXTRA_BYTES of slack lets vectorized kernels read past the end.
  std::vector<uint8_t> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> output(batch_size * output_height * output_width * channels);
  // Sentinel fill so unwritten outputs are detectable.
  std::fill(output.begin(), output.end(), 0xA5);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t pooling_op = nullptr;
  status = xnn_create_average_pooling2d_nhwc_qu8(
    padding_size, padding_size, padding_size, padding_size,
    pooling_size, pooling_size,
    stride, stride,
    channels, channels /* input pixel stride */, channels /* output pixel stride */,
    127 /* input zero point */, 0.75f /* input scale */,
    127 /* output zero point */, 1.25f /* output scale */,
    0, 255,
    0 /* flags */, &pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create Average Pooling operator");
    return;
  }

  status = xnn_reshape_average_pooling2d_nhwc_qu8(
    pooling_op,
    batch_size, input_height, input_width,
    /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Average Pooling operator");
    xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
    return;
  }

  status = xnn_setup_average_pooling2d_nhwc_qu8(
    pooling_op,
    input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Average Pooling operator");
    xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Average Pooling operator");
      xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Average Pooling operator");
    return;
  }
  pooling_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Bytes moved per iteration: whole input read plus whole output written.
  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(uint8_t),
    benchmark::Counter::kIsRate);
}
|
116 |
+
|
117 |
+
// Benchmarks XNNPACK single-precision (f32) 2D average pooling in NHWC layout.
// Benchmark ranges: 0=N, 1=H, 2=W, 3=pooling size K, 4=padding P,
// 5=stride S, 6=channels C. `net` labels the model family for reporting.
static void xnnpack_average_pooling_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  // Standard padded-pooling output size formula.
  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  // XNN_EXTRA_BYTES of slack lets vectorized kernels read past the end.
  std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> output(batch_size * output_height * output_width * channels);
  // NaN fill so unwritten outputs are detectable.
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t pooling_op = nullptr;
  status = xnn_create_average_pooling2d_nhwc_f32(
    padding_size, padding_size, padding_size, padding_size,
    pooling_size, pooling_size,
    stride, stride,
    channels, channels /* input pixel stride */, channels /* output pixel stride */,
    -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
    0 /* flags */, &pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create Average Pooling operator");
    return;
  }

  status = xnn_reshape_average_pooling2d_nhwc_f32(
    pooling_op,
    batch_size, input_height, input_width,
    /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Average Pooling operator");
    xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
    return;
  }

  status = xnn_setup_average_pooling2d_nhwc_f32(
    pooling_op,
    input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Average Pooling operator");
    xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Average Pooling operator");
      xnn_delete_operator(pooling_op);  // don't leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(pooling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Average Pooling operator");
    return;
  }
  pooling_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Bytes moved per iteration: whole input read plus whole output written.
  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
    benchmark::Counter::kIsRate);
}
|
200 |
+
|
201 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
202 |
+
// Reference benchmark: builds a one-operator AVERAGE_POOL_2D TensorFlow Lite
// model in memory via FlatBuffers and times single-threaded interpreter
// invocation with the same (N, H, W, K, P, S, C) parameters as the XNNPACK
// benchmarks above.
void tflite_average_pooling_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t pooling_size = state.range(3);
  const size_t padding_size = state.range(4);
  const size_t stride = state.range(5);
  const size_t channels = state.range(6);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  // TFLite only supports SAME/VALID padding, so map the explicit padding
  // amount onto one of those two modes or skip the configuration.
  tflite::Padding padding = tflite::Padding_VALID;
  if (2 * padding_size == (pooling_size - 1)) {
    padding = tflite::Padding_SAME;
  } else if (padding_size == 0) {
    padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  // Standard padded-pooling output size formula.
  const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
  const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

  std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> output(batch_size * output_height * output_width * channels);
  std::fill(output.begin(), output.end(), std::nanf(""));

  // Assemble the flatbuffer model: opcode table, options, buffers, tensors,
  // operator, subgraph, model — in the order the TFLite schema expects.
  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_AVERAGE_POOL_2D);

  flatbuffers::Offset<tflite::Pool2DOptions> pool2d_options = CreatePool2DOptions(
    builder, padding,
    stride /* stride_w */, stride /* stride_h */,
    pooling_size /* filter_width */, pooling_size /* filter_height */,
    tflite::ActivationFunctionType_NONE);

  // Buffer 0 is the mandatory empty sentinel buffer.
  flatbuffers::Offset<tflite::Buffer> buffers[1] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  };

  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(channels)
  };

  // Tensor 0: NHWC input; tensor 1: NHWC output.
  flatbuffers::Offset<tflite::Tensor> tensors[2] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32),
  };

  const int32_t op_inputs[1] = { 0 };
  const int32_t op_outputs[1] = { 1 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
    builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs, 1),
    builder.CreateVector<int32_t>(op_outputs, 1),
    tflite::BuiltinOptions_Pool2DOptions,
    pool2d_options.Union());

  const int32_t graph_inputs[1] = { 0 };
  const int32_t graph_outputs[1] = { 1 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
    builder,
    builder.CreateVector(tensors, 2),
    builder.CreateVector<int32_t>(graph_inputs, 1),
    builder.CreateVector<int32_t>(graph_outputs, 1),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    builder.CreateString("AVERAGE_POOL_2D model"),
    builder.CreateVector(buffers, 1));

  builder.Finish(model_buffer);

  // Resolver without default delegates so TFLite's own pooling kernel is
  // timed, not a delegate's implementation.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer())
;
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  // Single-threaded run for a like-for-like comparison with the XNNPACK path.
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the interpreter-owned input tensor (index 0) with random values.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * input_height * input_width * channels,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Bytes moved per iteration: whole input read plus whole output written.
  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) *
      batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
    benchmark::Counter::kIsRate);
}
|
338 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
339 |
+
|
340 |
+
// Final global average pooling in ImageNet classification models.
|
341 |
+
static void ImageNet(benchmark::internal::Benchmark* b) {
|
342 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
343 |
+
|
344 |
+
/* N H W K P S C */
|
345 |
+
b->Args({1, 13, 13, 13, 0, 1, 1000});
|
346 |
+
b->Args({1, 7, 7, 7, 0, 1, 1000});
|
347 |
+
}
|
348 |
+
|
349 |
+
// ShuffleNet v1 with 1 group.
|
350 |
+
static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
|
351 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
352 |
+
|
353 |
+
/* N H W K P S C */
|
354 |
+
b->Args({1, 56, 56, 3, 1, 2, 24});
|
355 |
+
b->Args({1, 28, 28, 3, 1, 2, 144});
|
356 |
+
b->Args({1, 14, 14, 3, 1, 2, 288});
|
357 |
+
b->Args({1, 7, 7, 3, 1, 2, 576});
|
358 |
+
}
|
359 |
+
|
360 |
+
// ShuffleNet v1 with 2 groups.
|
361 |
+
static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
|
362 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
363 |
+
|
364 |
+
/* N H W K P S C */
|
365 |
+
b->Args({1, 56, 56, 3, 1, 2, 24});
|
366 |
+
b->Args({1, 28, 28, 3, 1, 2, 200});
|
367 |
+
b->Args({1, 14, 14, 3, 1, 2, 400});
|
368 |
+
b->Args({1, 7, 7, 3, 1, 2, 800});
|
369 |
+
}
|
370 |
+
|
371 |
+
// ShuffleNet v1 with 3 groups.
|
372 |
+
static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
|
373 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
374 |
+
|
375 |
+
/* N H W K P S C */
|
376 |
+
b->Args({1, 56, 56, 3, 1, 2, 24});
|
377 |
+
b->Args({1, 28, 28, 3, 1, 2, 240});
|
378 |
+
b->Args({1, 14, 14, 3, 1, 2, 480});
|
379 |
+
b->Args({1, 7, 7, 3, 1, 2, 960});
|
380 |
+
}
|
381 |
+
|
382 |
+
// ShuffleNet v1 with 4 groups.
|
383 |
+
static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
|
384 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
385 |
+
|
386 |
+
/* N H W K P S C */
|
387 |
+
b->Args({1, 56, 56, 3, 1, 2, 24});
|
388 |
+
b->Args({1, 28, 28, 3, 1, 2, 272});
|
389 |
+
b->Args({1, 14, 14, 3, 1, 2, 576});
|
390 |
+
b->Args({1, 7, 7, 3, 1, 2, 1088});
|
391 |
+
}
|
392 |
+
|
393 |
+
// ShuffleNet v1 with 8 groups.
|
394 |
+
static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
|
395 |
+
b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
|
396 |
+
|
397 |
+
/* N H W K P S C */
|
398 |
+
b->Args({1, 56, 56, 3, 1, 2, 24});
|
399 |
+
b->Args({1, 28, 28, 3, 1, 2, 384});
|
400 |
+
b->Args({1, 14, 14, 3, 1, 2, 768});
|
401 |
+
b->Args({1, 7, 7, 3, 1, 2, 1536});
|
402 |
+
}
|
403 |
+
|
404 |
+
// XNNPACK f32 average pooling benchmarks; wall-clock (real) time is reported.
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

// TensorFlow Lite baseline over the same configurations.
#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK_CAPTURE(tflite_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

// XNNPACK quantized (qu8) average pooling benchmarks.
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

// Provide main() unless the embedder links its own benchmark driver.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/bankers-rounding.cc
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2020 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <limits>
|
11 |
+
#include <memory>
|
12 |
+
#include <random>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#include <fp16/fp16.h>
|
16 |
+
|
17 |
+
#include <xnnpack.h>
|
18 |
+
|
19 |
+
#include <benchmark/benchmark.h>
|
20 |
+
#include "bench/utils.h"
|
21 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
22 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
23 |
+
#include "tensorflow/lite/interpreter.h"
|
24 |
+
#include "tensorflow/lite/kernels/register.h"
|
25 |
+
#include "tensorflow/lite/model.h"
|
26 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
27 |
+
#include "tensorflow/lite/version.h"
|
28 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
29 |
+
|
30 |
+
|
31 |
+
// Benchmarks the XNNPACK F16 Bankers' Rounding operator (round half to even)
// on a single-channel tensor; the batch size comes from the benchmark range.
static void xnnpack_bankers_rounding_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random half-precision inputs in [-10, 10]; the output buffer is poisoned
  // with NaN so stale values cannot masquerade as results.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<uint16_t> output(batch_size);
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
  std::generate(input.begin(), input.end(), std::ref(f16rng));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t op = nullptr;
  status = xnn_create_bankers_rounding_nc_f16(
      1 /* channels */, 1 /* input stride */, 1 /* output stride */,
      0 /* flags */, &op);
  if (status != xnn_status_success || op == nullptr) {
    state.SkipWithError("failed to create Bankers' Rounding operator");
    return;
  }

  status = xnn_reshape_bankers_rounding_nc_f16(op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Bankers' Rounding operator");
    return;
  }

  status = xnn_setup_bankers_rounding_nc_f16(op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Bankers' Rounding operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    status = xnn_run_operator(op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Bankers' Rounding operator");
      return;
    }
  }

  status = xnn_delete_operator(op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Bankers' Rounding operator");
    return;
  }

  // Report clock rate plus element/byte throughput; each element is read once
  // and written once, hence the factor of 2 in bytes per iteration.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
97 |
+
|
98 |
+
// Benchmarks the XNNPACK F32 Bankers' Rounding operator (round half to even)
// on a single-channel tensor; the batch size comes from the benchmark range.
static void xnnpack_bankers_rounding_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random inputs in [-10, 10]; the output buffer is poisoned with NaN so
  // stale values cannot masquerade as results.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::vector<float> output(batch_size);
  std::fill(output.begin(), output.end(), std::nanf(""));
  std::generate(input.begin(), input.end(), std::ref(f32rng));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t op = nullptr;
  status = xnn_create_bankers_rounding_nc_f32(
      1 /* channels */, 1 /* input stride */, 1 /* output stride */,
      0 /* flags */, &op);
  if (status != xnn_status_success || op == nullptr) {
    state.SkipWithError("failed to create Bankers' Rounding operator");
    return;
  }

  status = xnn_reshape_bankers_rounding_nc_f32(op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Bankers' Rounding operator");
    return;
  }

  status = xnn_setup_bankers_rounding_nc_f32(op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Bankers' Rounding operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    status = xnn_run_operator(op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Bankers' Rounding operator");
      return;
    }
  }

  status = xnn_delete_operator(op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Bankers' Rounding operator");
    return;
  }

  // Report clock rate plus element/byte throughput; each element is read once
  // and written once, hence the factor of 2 in bytes per iteration.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
163 |
+
|
164 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
165 |
+
static void tflite_bankers_rounding_f32(benchmark::State& state) {
|
166 |
+
const size_t batch_size = state.range(0);
|
167 |
+
|
168 |
+
std::random_device random_device;
|
169 |
+
auto rng = std::mt19937(random_device());
|
170 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
|
171 |
+
|
172 |
+
flatbuffers::FlatBufferBuilder builder;
|
173 |
+
const flatbuffers::Offset<tflite::OperatorCode> operator_code =
|
174 |
+
CreateOperatorCode(builder, tflite::BuiltinOperator_ROUND);
|
175 |
+
|
176 |
+
const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
|
177 |
+
tflite::CreateBuffer(builder, builder.CreateVector({})),
|
178 |
+
}};
|
179 |
+
|
180 |
+
const std::array<int32_t, 1> shape{{
|
181 |
+
static_cast<int32_t>(batch_size)
|
182 |
+
}};
|
183 |
+
|
184 |
+
const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
|
185 |
+
tflite::CreateTensor(builder,
|
186 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
187 |
+
tflite::TensorType_FLOAT32),
|
188 |
+
tflite::CreateTensor(builder,
|
189 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
190 |
+
tflite::TensorType_FLOAT32),
|
191 |
+
}};
|
192 |
+
|
193 |
+
const std::array<int32_t, 1> op_inputs{{ 0 }};
|
194 |
+
const std::array<int32_t, 1> op_outputs{{ 1 }};
|
195 |
+
flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
|
196 |
+
builder,
|
197 |
+
0 /* opcode_index */,
|
198 |
+
builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
|
199 |
+
builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
|
200 |
+
|
201 |
+
const std::array<int32_t, 1> graph_inputs{{ 0 }};
|
202 |
+
const std::array<int32_t, 1> graph_outputs{{ 1 }};
|
203 |
+
const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
|
204 |
+
builder,
|
205 |
+
builder.CreateVector(tensors.data(), tensors.size()),
|
206 |
+
builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
|
207 |
+
builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
|
208 |
+
builder.CreateVector(&op, 1));
|
209 |
+
|
210 |
+
const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
|
211 |
+
TFLITE_SCHEMA_VERSION,
|
212 |
+
builder.CreateVector(&operator_code, 1),
|
213 |
+
builder.CreateVector(&subgraph, 1),
|
214 |
+
builder.CreateString("Round model"),
|
215 |
+
builder.CreateVector(buffers.data(), buffers.size()));
|
216 |
+
|
217 |
+
builder.Finish(model_buffer);
|
218 |
+
|
219 |
+
const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
|
220 |
+
tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
|
221 |
+
tflite::InterpreterBuilder interpreterBuilder(model, resolver);
|
222 |
+
std::unique_ptr<tflite::Interpreter> interpreter;
|
223 |
+
if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
|
224 |
+
state.SkipWithError("failed to create TFLite interpreter");
|
225 |
+
return;
|
226 |
+
}
|
227 |
+
interpreter->SetNumThreads(1);
|
228 |
+
|
229 |
+
if (interpreter->AllocateTensors() != kTfLiteOk) {
|
230 |
+
state.SkipWithError("failed to allocate tensors");
|
231 |
+
return;
|
232 |
+
}
|
233 |
+
|
234 |
+
std::generate(
|
235 |
+
interpreter->typed_tensor<float>(0),
|
236 |
+
interpreter->typed_tensor<float>(0) + batch_size,
|
237 |
+
std::ref(f32rng));
|
238 |
+
|
239 |
+
for (auto _ : state) {
|
240 |
+
if (interpreter->Invoke() != kTfLiteOk) {
|
241 |
+
state.SkipWithError("failed to invoke TFLite interpreter");
|
242 |
+
return;
|
243 |
+
}
|
244 |
+
}
|
245 |
+
|
246 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
247 |
+
if (cpu_frequency != 0) {
|
248 |
+
state.counters["cpufreq"] = cpu_frequency;
|
249 |
+
}
|
250 |
+
|
251 |
+
state.counters["elements"] =
|
252 |
+
benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
|
253 |
+
|
254 |
+
const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
|
255 |
+
state.counters["bytes"] =
|
256 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
257 |
+
|
258 |
+
interpreter.reset();
|
259 |
+
}
|
260 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
261 |
+
|
262 |
+
// Register the Bankers' Rounding benchmarks over the standard unary
// element-wise batch-size sweep; real time is reported.
BENCHMARK(xnnpack_bankers_rounding_f16)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK(xnnpack_bankers_rounding_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();

// TFLite baseline over the same sweep, when compiled in.
#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK(tflite_bankers_rounding_f32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

// Emit main() unless this file is linked into a combined benchmark binary
// that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/batch-matrix-multiply.cc
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cstddef>
|
8 |
+
#include <cstdint>
|
9 |
+
#include <functional>
|
10 |
+
#include <memory>
|
11 |
+
#include <random>
|
12 |
+
#include <utility>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#include <xnnpack.h>
|
16 |
+
|
17 |
+
#include <benchmark/benchmark.h>
|
18 |
+
#include "bench/utils.h"
|
19 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
20 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
21 |
+
#include "tensorflow/lite/interpreter.h"
|
22 |
+
#include "tensorflow/lite/kernels/register.h"
|
23 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
24 |
+
#include "tensorflow/lite/version.h"
|
25 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
26 |
+
|
27 |
+
// Benchmarks XNNPACK's F32 batch matrix multiplication on square problems:
// [B x M x K] * [B x K x N] with B = range(0) and M = K = N = range(1).
// `net` names the model the shapes were taken from (used by BENCHMARK_CAPTURE).
void xnnpack_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t m = state.range(1);
  const size_t k = state.range(1);
  const size_t n = state.range(1);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  std::vector<float> input1(batch_size * m * k);
  std::generate(input1.begin(), input1.end(), std::ref(f32rng));
  std::vector<float> input2(batch_size * k * n);
  std::generate(input2.begin(), input2.end(), std::ref(f32rng));
  const size_t output_elements = batch_size * m * n;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate through enough operator/output pairs to exceed the last-level
  // cache, so every timed run writes to cold output memory.
  const size_t num_buffers =
    1 + benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), sizeof(float) * (output_elements));
  std::vector<float> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> ops(num_buffers);
  for (xnn_operator_t& op : ops) {
    status = xnn_create_batch_matrix_multiply_nc_f32(/*flags=*/0, &op);
    if (status != xnn_status_success) {
      // Fixed copy-paste: these messages previously said "Convolution".
      state.SkipWithError("failed to create FP32 BatchMatMul operator");
      return;
    }
  }

  // Workspaces must stay alive for as long as their operators may run.
  std::vector<std::unique_ptr<std::vector<char>>> workspaces;

  size_t op_index = 0;
  for (xnn_operator_t& op : ops) {
    size_t workspace_size = 0;
    size_t workspace_alignment = 0;
    status =
      xnn_reshape_batch_matrix_multiply_nc_f32(op, batch_size, m, k, n, &workspace_size, &workspace_alignment, nullptr);
    // Check the reshape result before trusting workspace_size.
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape FP32 BatchMatMul operator");
      return;
    }

    auto workspace = std::make_unique<std::vector<char>>(workspace_size);
    char* workspace_ptr = workspace->data();
    workspaces.push_back(std::move(workspace));

    // Point each operator at its own slice of the oversized output buffer so
    // the rotating buffers actually land on distinct (cold) memory; the
    // original set every operator up with offset 0.
    status = xnn_setup_batch_matrix_multiply_nc_f32(
      op, workspace_ptr, input1.data(), input2.data(), output.data() + op_index * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP32 BatchMatMul operator");
      return;
    }
    ++op_index;
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(ops[buffer_index], /*threadpool=*/nullptr);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 BatchMatMul operator");
      return;
    }
  }

  for (xnn_operator_t& op : ops) {
    status = xnn_delete_operator(op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP32 BatchMatMul operator");
      return;
    }
    op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // NOTE(review): this counts B*M*K*N multiply-accumulates per iteration
  // (MACs, not 2*MACs); kept as-is to preserve comparability of reported
  // numbers -- confirm against the convention used by the other GEMM benches.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * batch_size * m * k * n,
    benchmark::Counter::kIsRate);
}
|
115 |
+
|
116 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
// TensorFlow Lite baseline: builds an in-memory single-op BATCH_MATMUL model
// via FlatBuffers and times Interpreter::Invoke() on the same square shapes
// as the XNNPACK benchmark above (B = range(0), M = K = N = range(1)).
void tflite_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t m = state.range(1);
  const size_t k = state.range(1);
  const size_t n = state.range(1);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  std::vector<float> input1(batch_size * m * k);
  std::generate(input1.begin(), input1.end(), std::ref(f32rng));
  std::vector<float> input2(batch_size * k * n);
  std::generate(input2.begin(), input2.end(), std::ref(f32rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_BATCH_MATMUL, 0);

  // No transposition of either operand; asymmetric-quantize inputs disabled.
  flatbuffers::Offset<tflite::BatchMatMulOptions> batch_mat_mul_options =
    tflite::CreateBatchMatMulOptions(builder, false, false, false);

  // Buffer 0 is the mandatory empty sentinel buffer of the TFLite schema.
  flatbuffers::Offset<tflite::Buffer> buffers[1] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  };

  const int32_t input1_shape[3] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(m),
    static_cast<int32_t>(k),
  };
  const int32_t input2_shape[3] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(k),
    static_cast<int32_t>(n),
  };
  const int32_t output_shape[3] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(m),
    static_cast<int32_t>(n),
  };

  // Exactly 3 tensors are serialized below (the original declared 4 slots).
  flatbuffers::Offset<tflite::Tensor> tensors[3] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input1_shape, 3),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("input1")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input2_shape, 3),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("input2")),
    tflite::CreateTensor(builder,
                         // Fixed: the output is rank-3 [B, M, N]; the original
                         // serialized only 2 of the 3 dimensions.
                         builder.CreateVector<int32_t>(output_shape, 3),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("output")),
  };

  const int32_t op_inputs[2] = { 0, 1 };
  const int32_t op_outputs[1] = { 2 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
    builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs, 2),
    builder.CreateVector<int32_t>(op_outputs, 1),
    tflite::BuiltinOptions_BatchMatMulOptions,
    batch_mat_mul_options.Union());

  const int32_t graph_inputs[2] = { 0, 1 };
  const int32_t graph_outputs[1] = { 2 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
    builder,
    builder.CreateVector(tensors, 3),
    builder.CreateVector<int32_t>(graph_inputs, 2),
    builder.CreateVector<int32_t>(graph_outputs, 1),
    builder.CreateVector(&op, 1),
    builder.CreateString("BatchMatMul subgraph"));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("BatchMatMul model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers, 1));

  builder.Finish(model_buffer);

  // Single-threaded interpreter without default delegates, so the built-in
  // (non-XNNPACK) kernel is what gets measured.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill both input tensors with random data once, outside the timed loop.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * m * k,
    std::ref(f32rng));
  std::generate(
    interpreter->typed_tensor<float>(1),
    interpreter->typed_tensor<float>(1) + batch_size * k * n,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Counts B*M*K*N multiply-accumulates per iteration, matching the XNNPACK
  // variant above so the two are directly comparable.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * batch_size * m * k * n,
    benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE
|
256 |
+
|
257 |
+
// Emit main() unless this file is linked into a combined benchmark binary
// that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/bf16-gemm.cc
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/gemm.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/gemm.h>
|
22 |
+
#include <xnnpack/math.h>
|
23 |
+
#include <xnnpack/pack.h>
|
24 |
+
#include <xnnpack/microfnptr.h>
|
25 |
+
#include <xnnpack/microparams-init.h>
|
26 |
+
|
27 |
+
|
28 |
+
// Shared driver that benchmarks a single BF16 GEMM minmax microkernel.
//   gemm        - the microkernel under test
//   mr/nr/kr/sr - the microkernel's tile and packing parameters
//   init_params - fills the minmax clamping parameters
//   isa_check   - optional predicate that skips the benchmark when the
//                 required ISA extension is unavailable
// Problem sizes (mc, nc, kc) come from the benchmark's registered ranges.
static void bf16_gemm(benchmark::State& state,
  xnn_bf16_gemm_minmax_ukernel_fn gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_bf16_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // Dimensions rounded up to the microkernel's tile/packing granularity.
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  // bfloat16 values are produced by truncating random fp32 bits to the
  // high 16 bits.
  std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
  std::vector<uint16_t> k(nc * kc);
  std::generate(k.begin(), k.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
  std::vector<uint16_t> b(nc);
  std::generate(b.begin(), b.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });

  // Packed weights hold the padded kernel plus the padded bias per buffer.
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  // The f16 GOI packing routine is bit-width compatible with bf16 data.
  xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7FC0) /* NaN */);

  // Prepare minmax parameters: clamp range is effectively [-inf, +inf]
  // (bfloat16 bit patterns 0xFF80 / 0x7F80).
  xnn_bf16_minmax_params params;
  init_params(&params,
    UINT16_C(0xFF80) /* -inf */, UINT16_C(0x7F80) /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the full (mc x nc) problem into (mr x nr) microkernel calls.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint16_t),
          a.data() + m * kc, kc * sizeof(uint16_t),
          // Packed-weights offset: each nr-wide panel holds kc_stride rows of
          // weights plus one row of bias, hence the (kc_stride + 1) factor.
          w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
          &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 * M * N * K floating-point operations (multiply + add) per iteration.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
|
106 |
+
|
107 |
+
|
108 |
+
#if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // Wrappers binding the shared bf16_gemm driver to individual NEON-BF16
  // BFDOT microkernels. Naming: MRxNRcKR -> mr x nr tile, kr channel packing.
  static void bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 1, 8, 2, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }
  static void bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 4, 8, 2, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }
  static void bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 5, 8, 2, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }
  static void bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 6, 8, 2, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }

  // c8 variants: wider channel packing (kr = 8) with narrower nr = 4 tiles.
  static void bf16_gemm_1x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 1, 4, 8, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }
  static void bf16_gemm_2x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
    bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 2, 4, 8, 1,
      xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
  }
|
134 |
+
static void bf16_gemm_3x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
|
135 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 3, 4, 8, 1,
|
136 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
137 |
+
}
|
138 |
+
static void bf16_gemm_4x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
|
139 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 4, 4, 8, 1,
|
140 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
141 |
+
}
|
142 |
+
static void bf16_gemm_5x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
|
143 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 5, 4, 8, 1,
|
144 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
145 |
+
}
|
146 |
+
|
147 |
+
static void bf16_gemm_1x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
|
148 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 1, 4, 8, 1,
|
149 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
150 |
+
}
|
151 |
+
static void bf16_gemm_2x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
|
152 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 2, 4, 8, 1,
|
153 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
154 |
+
}
|
155 |
+
static void bf16_gemm_3x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
|
156 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 3, 4, 8, 1,
|
157 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
158 |
+
}
|
159 |
+
static void bf16_gemm_4x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
|
160 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 4, 4, 8, 1,
|
161 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
162 |
+
}
|
163 |
+
static void bf16_gemm_5x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
|
164 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 5, 4, 8, 1,
|
165 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
|
166 |
+
}
|
167 |
+
|
168 |
+
BENCHMARK_GEMM(bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128)
|
169 |
+
BENCHMARK_GEMM(bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128)
|
170 |
+
BENCHMARK_GEMM(bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128)
|
171 |
+
BENCHMARK_GEMM(bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128)
|
172 |
+
|
173 |
+
BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfdot)
|
174 |
+
BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfdot)
|
175 |
+
BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfdot)
|
176 |
+
BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfdot)
|
177 |
+
BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfdot)
|
178 |
+
|
179 |
+
BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfmlal)
|
180 |
+
BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfmlal)
|
181 |
+
BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfmlal)
|
182 |
+
BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfmlal)
|
183 |
+
BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfmlal)
|
184 |
+
#endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
185 |
+
|
186 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
187 |
+
static void bf16_gemm_1x4c8__neonfma_zip(benchmark::State& state, const char* net) {
|
188 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 1, 4, 8, 1,
|
189 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
190 |
+
}
|
191 |
+
static void bf16_gemm_2x4c8__neonfma_zip(benchmark::State& state, const char* net) {
|
192 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 2, 4, 8, 1,
|
193 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
194 |
+
}
|
195 |
+
static void bf16_gemm_3x4c8__neonfma_zip(benchmark::State& state, const char* net) {
|
196 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 3, 4, 8, 1,
|
197 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
198 |
+
}
|
199 |
+
static void bf16_gemm_4x4c8__neonfma_zip(benchmark::State& state, const char* net) {
|
200 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 4, 4, 8, 1,
|
201 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
202 |
+
}
|
203 |
+
static void bf16_gemm_5x4c8__neonfma_zip(benchmark::State& state, const char* net) {
|
204 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 5, 4, 8, 1,
|
205 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
206 |
+
}
|
207 |
+
|
208 |
+
static void bf16_gemm_1x4c8__neonfma_shland(benchmark::State& state, const char* net) {
|
209 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 1, 4, 8, 1,
|
210 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
211 |
+
}
|
212 |
+
static void bf16_gemm_2x4c8__neonfma_shland(benchmark::State& state, const char* net) {
|
213 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 2, 4, 8, 1,
|
214 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
215 |
+
}
|
216 |
+
static void bf16_gemm_3x4c8__neonfma_shland(benchmark::State& state, const char* net) {
|
217 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 3, 4, 8, 1,
|
218 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
219 |
+
}
|
220 |
+
static void bf16_gemm_4x4c8__neonfma_shland(benchmark::State& state, const char* net) {
|
221 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 4, 4, 8, 1,
|
222 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
223 |
+
}
|
224 |
+
static void bf16_gemm_5x4c8__neonfma_shland(benchmark::State& state, const char* net) {
|
225 |
+
bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 5, 4, 8, 1,
|
226 |
+
xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
|
227 |
+
}
|
228 |
+
|
229 |
+
BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_zip)
|
230 |
+
BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_zip)
|
231 |
+
BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_zip)
|
232 |
+
BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_zip)
|
233 |
+
BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_zip)
|
234 |
+
|
235 |
+
BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_shland)
|
236 |
+
BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_shland)
|
237 |
+
BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_shland)
|
238 |
+
BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_shland)
|
239 |
+
BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_shland)
|
240 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
241 |
+
|
242 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
243 |
+
BENCHMARK_MAIN();
|
244 |
+
#endif
|
bench/bgemm.h
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#pragma once
|
7 |
+
|
8 |
+
#include <benchmark/benchmark.h>
|
9 |
+
|
10 |
+
#define BENCHMARK_BGEMM(bgemm_fn) \
|
11 |
+
BENCHMARK_CAPTURE(bgemm_fn, albert, "Albert")->Apply(AlbertBgemmArguments)->UseRealTime(); \
|
12 |
+
BENCHMARK_CAPTURE(bgemm_fn, mobilebert, "MobileBert")->Apply(MobilebertBgemmArguments)->UseRealTime(); \
|
13 |
+
BENCHMARK_CAPTURE(bgemm_fn, sd1x_diffusion, "SD1.X Diffusion")->Apply(SD1XDiffusionBgemmArguments)->UseRealTime(); \
|
14 |
+
BENCHMARK_CAPTURE(bgemm_fn, sd1x_encoder_decoder, "SD1.X Encoder-Decoder")->Apply(SD1XEncoderDecoderBgemmArguments)->UseRealTime(); \
|
15 |
+
BENCHMARK_CAPTURE(bgemm_fn, sd1x_text_encoder, "SD1.X Text Encoder")->Apply(SD1XTextEncoderBgemmArguments)->UseRealTime();
|
16 |
+
|
17 |
+
|
18 |
+
static void AlbertBgemmArguments(benchmark::internal::Benchmark* b) {
|
19 |
+
b->ArgNames({"B", "M", "N", "K"});
|
20 |
+
|
21 |
+
/* B M N K */
|
22 |
+
b->Args({12, 384, 64, 384});
|
23 |
+
b->Args({12, 384, 384, 64});
|
24 |
+
}
|
25 |
+
|
26 |
+
static void MobilebertBgemmArguments(benchmark::internal::Benchmark* b) {
|
27 |
+
b->ArgNames({"B", "M", "N", "K"});
|
28 |
+
|
29 |
+
/* B M N K */
|
30 |
+
b->Args({4, 384, 32, 384});
|
31 |
+
b->Args({4, 384, 384, 32});
|
32 |
+
}
|
33 |
+
|
34 |
+
static void SD1XDiffusionBgemmArguments(benchmark::internal::Benchmark* b) {
|
35 |
+
b->ArgNames({"B", "M", "N", "K"});
|
36 |
+
|
37 |
+
/* B M N K */
|
38 |
+
b->Args({8, 4096, 4096, 40});
|
39 |
+
b->Args({8, 4096, 40, 4096});
|
40 |
+
b->Args({8, 4096, 77, 40});
|
41 |
+
b->Args({8, 4096, 40, 77});
|
42 |
+
b->Args({8, 1024, 1024, 80});
|
43 |
+
b->Args({8, 1024, 80, 1024});
|
44 |
+
b->Args({8, 1024, 77, 80});
|
45 |
+
b->Args({8, 1024, 80, 77});
|
46 |
+
b->Args({8, 256, 256, 160});
|
47 |
+
b->Args({8, 256, 160, 256});
|
48 |
+
b->Args({8, 256, 77, 160});
|
49 |
+
b->Args({8, 256, 160, 77});
|
50 |
+
b->Args({8, 64, 64, 160});
|
51 |
+
b->Args({8, 64, 160, 64});
|
52 |
+
b->Args({8, 64, 77, 160});
|
53 |
+
b->Args({8, 64, 160, 77});
|
54 |
+
}
|
55 |
+
|
56 |
+
static void SD1XEncoderDecoderBgemmArguments(benchmark::internal::Benchmark* b) {
|
57 |
+
b->ArgNames({"B", "M", "N", "K"});
|
58 |
+
|
59 |
+
/* B M N K */
|
60 |
+
b->Args({1, 4096, 4096, 512});
|
61 |
+
b->Args({1, 512, 4096, 4096});
|
62 |
+
}
|
63 |
+
|
64 |
+
static void SD1XTextEncoderBgemmArguments(benchmark::internal::Benchmark* b) {
|
65 |
+
b->ArgNames({"B", "M", "N", "K"});
|
66 |
+
|
67 |
+
/* B M N K */
|
68 |
+
b->Args({12, 77, 77, 64});
|
69 |
+
b->Args({12, 77, 64, 77});
|
70 |
+
}
|
bench/ceiling.cc
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2020 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <limits>
|
11 |
+
#include <memory>
|
12 |
+
#include <random>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#include <fp16/fp16.h>
|
16 |
+
|
17 |
+
#include <xnnpack.h>
|
18 |
+
|
19 |
+
#include <benchmark/benchmark.h>
|
20 |
+
#include "bench/utils.h"
|
21 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
22 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
23 |
+
#include "tensorflow/lite/interpreter.h"
|
24 |
+
#include "tensorflow/lite/kernels/register.h"
|
25 |
+
#include "tensorflow/lite/model.h"
|
26 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
27 |
+
#include "tensorflow/lite/version.h"
|
28 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
29 |
+
|
30 |
+
|
31 |
+
// Benchmarks the XNNPACK Ceiling (round toward +inf) operator on FP16 data.
// Reports elements/s and bytes/s (one read + one write per element).
static void xnnpack_ceiling_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Extra trailing bytes allow vectorized micro-kernels to read past the end.
  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<uint16_t> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Pre-fill the output with NaN so stale values are easy to spot.
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t ceiling_op = nullptr;
  status = xnn_create_ceiling_nc_f16(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &ceiling_op);
  if (status != xnn_status_success || ceiling_op == nullptr) {
    state.SkipWithError("failed to create Ceiling operator");
    return;
  }

  status = xnn_reshape_ceiling_nc_f16(ceiling_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Ceiling operator");
    xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
    return;
  }

  status = xnn_setup_ceiling_nc_f16(ceiling_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Ceiling operator");
    xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Ceiling operator");
      xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(ceiling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Ceiling operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
97 |
+
|
98 |
+
// Benchmarks the XNNPACK Ceiling (round toward +inf) operator on FP32 data.
// Reports elements/s and bytes/s (one read + one write per element).
static void xnnpack_ceiling_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  // Extra trailing bytes allow vectorized micro-kernels to read past the end.
  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::vector<float> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  // Pre-fill the output with NaN so stale values are easy to spot.
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t ceiling_op = nullptr;
  status = xnn_create_ceiling_nc_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &ceiling_op);
  if (status != xnn_status_success || ceiling_op == nullptr) {
    state.SkipWithError("failed to create Ceiling operator");
    return;
  }

  status = xnn_reshape_ceiling_nc_f32(ceiling_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape Ceiling operator");
    xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
    return;
  }

  status = xnn_setup_ceiling_nc_f32(ceiling_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup Ceiling operator");
    xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run Ceiling operator");
      xnn_delete_operator(ceiling_op);  // do not leak the operator on early exit
      return;
    }
  }

  status = xnn_delete_operator(ceiling_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete Ceiling operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
163 |
+
|
164 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
// Baseline: runs the same FP32 ceiling workload through the TensorFlow Lite
// built-in CEIL kernel, building a one-op flatbuffer model in memory.
static void tflite_ceiling_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));

  // Assemble a minimal model: one CEIL operator, one input, one output.
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(builder, tflite::BuiltinOperator_CEIL);

  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
  }};

  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      builder.CreateString("Ceil model"),
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Build a single-threaded interpreter without the default XNNPACK delegate,
  // so the reference TFLite kernel is what gets measured.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE
|
261 |
+
|
262 |
+
BENCHMARK(xnnpack_ceiling_f16)
|
263 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
264 |
+
->UseRealTime();
|
265 |
+
BENCHMARK(xnnpack_ceiling_f32)
|
266 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
|
267 |
+
->UseRealTime();
|
268 |
+
|
269 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
270 |
+
BENCHMARK(tflite_ceiling_f32)
|
271 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
|
272 |
+
->UseRealTime();
|
273 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
274 |
+
|
275 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
276 |
+
BENCHMARK_MAIN();
|
277 |
+
#endif
|
bench/channel-shuffle.cc
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cmath>
|
11 |
+
#include <functional>
|
12 |
+
#include <limits>
|
13 |
+
#include <random>
|
14 |
+
#include <vector>
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
|
18 |
+
#include <benchmark/benchmark.h>
|
19 |
+
#include "bench/utils.h"
|
20 |
+
|
21 |
+
|
22 |
+
// Benchmarks the XNNPACK 8-bit Channel Shuffle operator.
// Arguments: N = batch size, G = groups, GC = channels per group.
static void channel_shuffle_x8(benchmark::State& state, const char* net) {
  const size_t batch_size = static_cast<size_t>(state.range(0));
  const size_t groups = static_cast<size_t>(state.range(1));
  const size_t group_channels = static_cast<size_t>(state.range(2));

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  // Extra bytes let vectorized kernels read past the logical end of the input.
  std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels);
  std::vector<uint8_t> output(batch_size * groups * group_channels);
  std::generate(input.begin(), input.end(), std::ref(u8rng));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Dense layout: stride equals the full channel count (groups * group_channels).
  xnn_operator_t shuffle_op = nullptr;
  status = xnn_create_channel_shuffle_nc_x8(
    groups, group_channels,
    groups * group_channels /* input stride */,
    groups * group_channels /* output stride */,
    0 /* flags */, &shuffle_op);
  if (status != xnn_status_success || shuffle_op == nullptr) {
    state.SkipWithError("failed to create X8 Channel Shuffle operator");
    return;
  }

  status = xnn_reshape_channel_shuffle_nc_x8(
    shuffle_op,
    batch_size,
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape X8 Channel Shuffle operator");
    return;
  }

  status = xnn_setup_channel_shuffle_nc_x8(
    shuffle_op,
    input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup X8 Channel Shuffle operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(shuffle_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run X8 Channel Shuffle operator");
      return;
    }
  }

  status = xnn_delete_operator(shuffle_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete X8 Channel Shuffle operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Each element is read once and written once.
  const size_t elements_per_iteration = batch_size * groups * group_channels;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
96 |
+
|
97 |
+
// Benchmarks the XNNPACK 32-bit Channel Shuffle operator.
// Arguments: N = batch size, G = groups, GC = channels per group.
static void channel_shuffle_x32(benchmark::State& state, const char* net) {
  const size_t batch_size = static_cast<size_t>(state.range(0));
  const size_t groups = static_cast<size_t>(state.range(1));
  const size_t group_channels = static_cast<size_t>(state.range(2));

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  // Extra bytes let vectorized kernels read past the logical end of the input.
  std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + batch_size * groups * group_channels);
  std::vector<float> output(batch_size * groups * group_channels);
  std::generate(input.begin(), input.end(), std::ref(f32rng));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Dense layout: stride equals the full channel count (groups * group_channels).
  xnn_operator_t shuffle_op = nullptr;
  status = xnn_create_channel_shuffle_nc_x32(
    groups, group_channels,
    groups * group_channels /* input stride */,
    groups * group_channels /* output stride */,
    0 /* flags */, &shuffle_op);
  if (status != xnn_status_success || shuffle_op == nullptr) {
    state.SkipWithError("failed to create X32 Channel Shuffle operator");
    return;
  }

  status = xnn_reshape_channel_shuffle_nc_x32(
    shuffle_op,
    batch_size,
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape X32 Channel Shuffle operator");
    return;
  }

  status = xnn_setup_channel_shuffle_nc_x32(
    shuffle_op,
    input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup X32 Channel Shuffle operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(shuffle_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run X32 Channel Shuffle operator");
      return;
    }
  }

  status = xnn_delete_operator(shuffle_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete X32 Channel Shuffle operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Each element is read once and written once.
  const size_t elements_per_iteration = batch_size * groups * group_channels;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
171 |
+
|
172 |
+
static void ShuffleNetV1G2Arguments(benchmark::internal::Benchmark* b)
|
173 |
+
{
|
174 |
+
b->ArgNames({"N", "G", "GC"});
|
175 |
+
|
176 |
+
/******** Stage 2 ********/
|
177 |
+
/* H W G CG */
|
178 |
+
b->Args({56 * 56, 2, 25});
|
179 |
+
b->Args({28 * 28, 2, 25});
|
180 |
+
|
181 |
+
/******** Stage 3 ********/
|
182 |
+
/* H W G CG */
|
183 |
+
b->Args({28 * 28, 2, 50});
|
184 |
+
b->Args({14 * 14, 2, 50});
|
185 |
+
|
186 |
+
/******** Stage 4 ********/
|
187 |
+
/* H W G CG */
|
188 |
+
b->Args({14 * 14, 2, 100});
|
189 |
+
b->Args({ 7 * 7, 2, 100});
|
190 |
+
}
|
191 |
+
|
192 |
+
static void ShuffleNetV1G3Arguments(benchmark::internal::Benchmark* b)
|
193 |
+
{
|
194 |
+
b->ArgNames({"N", "G", "GC"});
|
195 |
+
|
196 |
+
/******** Stage 2 *******/
|
197 |
+
/* H W G CG */
|
198 |
+
b->Args({56 * 56, 3, 20});
|
199 |
+
b->Args({28 * 28, 3, 20});
|
200 |
+
|
201 |
+
/******** Stage 3 *******/
|
202 |
+
/* H W G CG */
|
203 |
+
b->Args({28 * 28, 3, 40});
|
204 |
+
b->Args({14 * 14, 3, 40});
|
205 |
+
|
206 |
+
/******** Stage 4 *******/
|
207 |
+
/* H W G CG */
|
208 |
+
b->Args({14 * 14, 3, 80});
|
209 |
+
b->Args({ 7 * 7, 3, 80});
|
210 |
+
}
|
211 |
+
|
212 |
+
static void ShuffleNetV1G4Arguments(benchmark::internal::Benchmark* b)
|
213 |
+
{
|
214 |
+
b->ArgNames({"N", "G", "GC"});
|
215 |
+
|
216 |
+
/******** Stage 2 *******/
|
217 |
+
/* H W G CG */
|
218 |
+
b->Args({56 * 56, 4, 17});
|
219 |
+
b->Args({28 * 28, 4, 17});
|
220 |
+
|
221 |
+
/******** Stage 3 *******/
|
222 |
+
/* H W G CG */
|
223 |
+
b->Args({28 * 28, 4, 34});
|
224 |
+
b->Args({14 * 14, 4, 34});
|
225 |
+
|
226 |
+
/******** Stage 4 *******/
|
227 |
+
/* H W G CG */
|
228 |
+
b->Args({14 * 14, 4, 68});
|
229 |
+
b->Args({ 7 * 7, 4, 68});
|
230 |
+
}
|
231 |
+
|
232 |
+
static void ShuffleNetV1G8Arguments(benchmark::internal::Benchmark* b)
|
233 |
+
{
|
234 |
+
b->ArgNames({"N", "G", "GC"});
|
235 |
+
|
236 |
+
/******** Stage 2 *******/
|
237 |
+
/* H W G CG */
|
238 |
+
b->Args({56 * 56, 8, 12});
|
239 |
+
b->Args({28 * 28, 8, 12});
|
240 |
+
|
241 |
+
/******** Stage 3 *******/
|
242 |
+
/* H W G CG */
|
243 |
+
b->Args({28 * 28, 8, 24});
|
244 |
+
b->Args({14 * 14, 8, 24});
|
245 |
+
|
246 |
+
/******** Stage 4 *******/
|
247 |
+
/* H W G CG */
|
248 |
+
b->Args({14 * 14, 8, 48});
|
249 |
+
b->Args({ 7 * 7, 8, 48});
|
250 |
+
}
|
251 |
+
|
252 |
+
static void ShuffleNetV2x0_5Arguments(benchmark::internal::Benchmark* b)
|
253 |
+
{
|
254 |
+
b->ArgNames({"N", "G", "GC"});
|
255 |
+
|
256 |
+
/******** Stage 2 *******/
|
257 |
+
/* H W G CG */
|
258 |
+
b->Args({28 * 28, 2, 24});
|
259 |
+
|
260 |
+
/******** Stage 3 *******/
|
261 |
+
/* H W G CG */
|
262 |
+
b->Args({14 * 14, 2, 48});
|
263 |
+
|
264 |
+
/******** Stage 4 *******/
|
265 |
+
/* H W G CG */
|
266 |
+
b->Args({ 7 * 7, 2, 96});
|
267 |
+
}
|
268 |
+
|
269 |
+
static void ShuffleNetV2x1_0Arguments(benchmark::internal::Benchmark* b)
|
270 |
+
{
|
271 |
+
b->ArgNames({"N", "G", "GC"});
|
272 |
+
|
273 |
+
/******** Stage 2 ********/
|
274 |
+
/* H W G CG */
|
275 |
+
b->Args({28 * 28, 2, 58});
|
276 |
+
|
277 |
+
/******** Stage 3 ********/
|
278 |
+
/* H W G CG */
|
279 |
+
b->Args({14 * 14, 2, 116});
|
280 |
+
|
281 |
+
/******** Stage 4 ********/
|
282 |
+
/* H W G CG */
|
283 |
+
b->Args({ 7 * 7, 2, 232});
|
284 |
+
}
|
285 |
+
|
286 |
+
static void ShuffleNetV2x1_5Arguments(benchmark::internal::Benchmark* b)
|
287 |
+
{
|
288 |
+
b->ArgNames({"N", "G", "GC"});
|
289 |
+
|
290 |
+
/******** Stage 2 ********/
|
291 |
+
/* H W G CG */
|
292 |
+
b->Args({28 * 28, 2, 88});
|
293 |
+
|
294 |
+
/******** Stage 3 ********/
|
295 |
+
/* H W G CG */
|
296 |
+
b->Args({14 * 14, 2, 176});
|
297 |
+
|
298 |
+
/******** Stage 4 ********/
|
299 |
+
/* H W G CG */
|
300 |
+
b->Args({ 7 * 7, 2, 352});
|
301 |
+
}
|
302 |
+
|
303 |
+
static void ShuffleNetV2x2_0Arguments(benchmark::internal::Benchmark* b)
|
304 |
+
{
|
305 |
+
b->ArgNames({"N", "G", "GC"});
|
306 |
+
|
307 |
+
/******** Stage 2 ********/
|
308 |
+
/* H W G CG */
|
309 |
+
b->Args({28 * 28, 2, 122});
|
310 |
+
|
311 |
+
/******** Stage 3 ********/
|
312 |
+
/* H W G CG */
|
313 |
+
b->Args({14 * 14, 2, 244});
|
314 |
+
|
315 |
+
/******** Stage 4 ********/
|
316 |
+
/* H W G CG */
|
317 |
+
b->Args({ 7 * 7, 2, 488});
|
318 |
+
}
|
319 |
+
|
320 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
|
321 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
|
322 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
|
323 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
|
324 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
|
325 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
|
326 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
|
327 |
+
BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
|
328 |
+
|
329 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
|
330 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
|
331 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
|
332 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
|
333 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
|
334 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
|
335 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
|
336 |
+
BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
|
337 |
+
|
338 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
339 |
+
BENCHMARK_MAIN();
|
340 |
+
#endif
|
bench/conv.h
ADDED
@@ -0,0 +1,852 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#pragma once
|
10 |
+
|
11 |
+
#include <benchmark/benchmark.h>
|
12 |
+
|
13 |
+
|
14 |
+
#define BENCHMARK_CONV(conv_fn) \
|
15 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1ConvArguments)->UseRealTime(); \
|
16 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2ConvArguments)->UseRealTime(); \
|
17 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallConvArguments)->UseRealTime(); \
|
18 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeConvArguments)->UseRealTime(); \
|
19 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1ConvArguments)->UseRealTime(); \
|
20 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2ConvArguments)->UseRealTime(); \
|
21 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3ConvArguments)->UseRealTime(); \
|
22 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4ConvArguments)->UseRealTime(); \
|
23 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8ConvArguments)->UseRealTime(); \
|
24 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05ConvArguments)->UseRealTime(); \
|
25 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10ConvArguments)->UseRealTime(); \
|
26 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15ConvArguments)->UseRealTime(); \
|
27 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20ConvArguments)->UseRealTime(); \
|
28 |
+
BENCHMARK_CAPTURE(conv_fn, inception_v3, "Inception v3")->Apply(InceptionV3ConvArguments)->UseRealTime(); \
|
29 |
+
BENCHMARK_CAPTURE(conv_fn, resnet18, "ResNet-18")->Apply(ResNet18ConvArguments)->UseRealTime(); \
|
30 |
+
BENCHMARK_CAPTURE(conv_fn, resnet50, "ResNet-50")->Apply(ResNet50ConvArguments)->UseRealTime(); \
|
31 |
+
BENCHMARK_CAPTURE(conv_fn, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10ConvArguments)->UseRealTime(); \
|
32 |
+
BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime(); \
|
33 |
+
BENCHMARK_CAPTURE(conv_fn, vgg, "VGG")->Apply(VGGConvArguments)->UseRealTime(); \
|
34 |
+
BENCHMARK_CAPTURE(conv_fn, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915ConvArguments)->UseRealTime(); \
|
35 |
+
BENCHMARK_CAPTURE(conv_fn, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935ConvArguments)->UseRealTime(); \
|
36 |
+
BENCHMARK_CAPTURE(conv_fn, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955ConvArguments)->UseRealTime();
|
37 |
+
|
38 |
+
|
39 |
+
// ShuffleNet v1 with 1 group.
|
40 |
+
static void ShuffleNetV1G1ConvArguments(benchmark::internal::Benchmark* b) {
|
41 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
42 |
+
|
43 |
+
/*********************** Conv 1 **********************/
|
44 |
+
/* H W KH KW PH PW S D GCin GCout */
|
45 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
46 |
+
/*************** Stage 2: stride-2 unit **************/
|
47 |
+
/* H W KH KW PH PW S D GCin GCout */
|
48 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 36});
|
49 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 120});
|
50 |
+
/*************** Stage 2: stride-1 units *************/
|
51 |
+
/* H W KH KW PH PW S D GCin GCout */
|
52 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 36});
|
53 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 144});
|
54 |
+
/*************** Stage 3: stride-2 unit **************/
|
55 |
+
/* H W KH KW PH PW S D GCin GCout */
|
56 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 72});
|
57 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 144});
|
58 |
+
/*************** Stage 3: stride-1 units *************/
|
59 |
+
/* H W KH KW PH PW S D GCin GCout */
|
60 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 72});
|
61 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 288});
|
62 |
+
/*************** Stage 4: stride-2 unit **************/
|
63 |
+
/* H W KH KW PH PW S D GCin GCout */
|
64 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 144});
|
65 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 288});
|
66 |
+
/*************** Stage 4: stride-1 units *************/
|
67 |
+
/* H W KH KW PH PW S D GCin GCout */
|
68 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 144});
|
69 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 576});
|
70 |
+
}
|
71 |
+
|
72 |
+
// ShuffleNet v1 with 2 groups.
|
73 |
+
static void ShuffleNetV1G2ConvArguments(benchmark::internal::Benchmark* b) {
|
74 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
75 |
+
|
76 |
+
/*********************** Conv 1 **********************/
|
77 |
+
/* H W KH KW PH PW S D GCin GCout */
|
78 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
79 |
+
/*************** Stage 2: stride-2 unit **************/
|
80 |
+
/* H W KH KW PH PW S D GCin GCout */
|
81 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 50});
|
82 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 88});
|
83 |
+
/*************** Stage 2: stride-1 units *************/
|
84 |
+
/* H W KH KW PH PW S D GCin GCout */
|
85 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 25});
|
86 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 100});
|
87 |
+
/*************** Stage 3: stride-2 unit **************/
|
88 |
+
/* H W KH KW PH PW S D GCin GCout */
|
89 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 50});
|
90 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 100});
|
91 |
+
/*************** Stage 3: stride-1 units *************/
|
92 |
+
/* H W KH KW PH PW S D GCin GCout */
|
93 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 50});
|
94 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 200});
|
95 |
+
/*************** Stage 4: stride-2 unit **************/
|
96 |
+
/* H W KH KW PH PW S D GCin GCout */
|
97 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 100});
|
98 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 200});
|
99 |
+
/*************** Stage 4: stride-1 units *************/
|
100 |
+
/* H W KH KW PH PW S D GCin GCout */
|
101 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 400, 100});
|
102 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 400});
|
103 |
+
}
|
104 |
+
|
105 |
+
// ShuffleNet v1 with 3 groups.
|
106 |
+
static void ShuffleNetV1G3ConvArguments(benchmark::internal::Benchmark* b) {
|
107 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
108 |
+
|
109 |
+
/*********************** Conv 1 **********************/
|
110 |
+
/* H W KH KW PH PW S D GCin GCout */
|
111 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
112 |
+
/*************** Stage 2: stride-2 unit **************/
|
113 |
+
/* H W KH KW PH PW S D GCin GCout */
|
114 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 60});
|
115 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 72});
|
116 |
+
/*************** Stage 2: stride-1 units *************/
|
117 |
+
/* H W KH KW PH PW S D GCin GCout */
|
118 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 20});
|
119 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 80});
|
120 |
+
/*************** Stage 3: stride-2 unit **************/
|
121 |
+
/* H W KH KW PH PW S D GCin GCout */
|
122 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 40});
|
123 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 80});
|
124 |
+
/*************** Stage 3: stride-1 units *************/
|
125 |
+
/* H W KH KW PH PW S D GCin GCout */
|
126 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 40});
|
127 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 160});
|
128 |
+
/*************** Stage 4: stride-2 unit **************/
|
129 |
+
/* H W KH KW PH PW S D GCin GCout */
|
130 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 80});
|
131 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 160});
|
132 |
+
/*************** Stage 4: stride-1 units *************/
|
133 |
+
/* H W KH KW PH PW S D GCin GCout */
|
134 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 80});
|
135 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 320});
|
136 |
+
}
|
137 |
+
|
138 |
+
// ShuffleNet v1 with 4 groups.
|
139 |
+
static void ShuffleNetV1G4ConvArguments(benchmark::internal::Benchmark* b) {
|
140 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
141 |
+
|
142 |
+
/*********************** Conv 1 **********************/
|
143 |
+
/* H W KH KW PH PW S D GCin GCout */
|
144 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
145 |
+
/*************** Stage 2: stride-2 unit **************/
|
146 |
+
/* H W KH KW PH PW S D GCin GCout */
|
147 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 68});
|
148 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 62});
|
149 |
+
/*************** Stage 2: stride-1 units *************/
|
150 |
+
/* H W KH KW PH PW S D GCin GCout */
|
151 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 17});
|
152 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 68});
|
153 |
+
/*************** Stage 3: stride-2 unit **************/
|
154 |
+
/* H W KH KW PH PW S D GCin GCout */
|
155 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 34});
|
156 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 68});
|
157 |
+
/*************** Stage 3: stride-1 units *************/
|
158 |
+
/* H W KH KW PH PW S D GCin GCout */
|
159 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 34});
|
160 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 136});
|
161 |
+
/*************** Stage 4: stride-2 unit **************/
|
162 |
+
/* H W KH KW PH PW S D GCin GCout */
|
163 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 68});
|
164 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 136});
|
165 |
+
/*************** Stage 4: stride-1 units *************/
|
166 |
+
/* H W KH KW PH PW S D GCin GCout */
|
167 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 272, 68});
|
168 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 272});
|
169 |
+
}
|
170 |
+
|
171 |
+
// ShuffleNet v1 with 8 groups.
|
172 |
+
static void ShuffleNetV1G8ConvArguments(benchmark::internal::Benchmark* b) {
|
173 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
174 |
+
|
175 |
+
/*********************** Conv 1 **********************/
|
176 |
+
/* H W KH KW PH PW S D GCin GCout */
|
177 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
178 |
+
/*************** Stage 2: stride-2 unit **************/
|
179 |
+
/* H W KH KW PH PW S D GCin GCout */
|
180 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 96});
|
181 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 45});
|
182 |
+
/*************** Stage 2: stride-1 units *************/
|
183 |
+
/* H W KH KW PH PW S D GCin GCout */
|
184 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 12});
|
185 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 48});
|
186 |
+
/*************** Stage 3: stride-2 unit **************/
|
187 |
+
/* H W KH KW PH PW S D GCin GCout */
|
188 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 24});
|
189 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 48});
|
190 |
+
/*************** Stage 3: stride-1 units *************/
|
191 |
+
/* H W KH KW PH PW S D GCin GCout */
|
192 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 24});
|
193 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 96});
|
194 |
+
/*************** Stage 4: stride-2 unit **************/
|
195 |
+
/* H W KH KW PH PW S D GCin GCout */
|
196 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 48});
|
197 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 96});
|
198 |
+
/*************** Stage 4: stride-1 units *************/
|
199 |
+
/* H W KH KW PH PW S D GCin GCout */
|
200 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 48});
|
201 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 192});
|
202 |
+
}
|
203 |
+
|
204 |
+
// ShuffleNet v2 (0.5X scale).
|
205 |
+
static void ShuffleNetV2X05ConvArguments(benchmark::internal::Benchmark* b) {
|
206 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
207 |
+
|
208 |
+
/*********************** Conv 1 **********************/
|
209 |
+
/* H W KH KW PH PW S D GCin GCout */
|
210 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
211 |
+
/********************** Stage 2 **********************/
|
212 |
+
/* H W KH KW PH PW S D GCin GCout */
|
213 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 24});
|
214 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 24});
|
215 |
+
/********************** Stage 3 **********************/
|
216 |
+
/* H W KH KW PH PW S D GCin GCout */
|
217 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 48});
|
218 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 48});
|
219 |
+
/********************** Stage 4 **********************/
|
220 |
+
/* H W KH KW PH PW S D GCin GCout */
|
221 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 96});
|
222 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 96});
|
223 |
+
/*********************** Conv 5 **********************/
|
224 |
+
/* H W KH KW PH PW S D GCin GCout */
|
225 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 1024});
|
226 |
+
}
|
227 |
+
|
228 |
+
// ShuffleNet v2 (1.0X scale).
|
229 |
+
static void ShuffleNetV2X10ConvArguments(benchmark::internal::Benchmark* b) {
|
230 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
231 |
+
|
232 |
+
/*********************** Conv 1 **********************/
|
233 |
+
/* H W KH KW PH PW S D GCin GCout */
|
234 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
235 |
+
/********************** Stage 2 **********************/
|
236 |
+
/* H W KH KW PH PW S D GCin GCout */
|
237 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 58});
|
238 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 58});
|
239 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 58, 58});
|
240 |
+
/********************** Stage 3 **********************/
|
241 |
+
/* H W KH KW PH PW S D GCin GCout */
|
242 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 116, 116});
|
243 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 116, 116});
|
244 |
+
/********************** Stage 4 **********************/
|
245 |
+
/* H W KH KW PH PW S D GCin GCout */
|
246 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 232, 232});
|
247 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 232, 232});
|
248 |
+
/*********************** Conv 5 **********************/
|
249 |
+
/* H W KH KW PH PW S D GCin GCout */
|
250 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 464, 1024});
|
251 |
+
}
|
252 |
+
|
253 |
+
// ShuffleNet v2 (1.5X scale).
|
254 |
+
static void ShuffleNetV2X15ConvArguments(benchmark::internal::Benchmark* b) {
|
255 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
256 |
+
|
257 |
+
/*********************** Conv 1 **********************/
|
258 |
+
/* H W KH KW PH PW S D GCin GCout */
|
259 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
260 |
+
/********************** Stage 2 **********************/
|
261 |
+
/* H W KH KW PH PW S D GCin GCout */
|
262 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
|
263 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 88});
|
264 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 88});
|
265 |
+
/********************** Stage 3 **********************/
|
266 |
+
/* H W KH KW PH PW S D GCin GCout */
|
267 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 176, 176});
|
268 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 176, 176});
|
269 |
+
/********************** Stage 4 **********************/
|
270 |
+
/* H W KH KW PH PW S D GCin GCout */
|
271 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 352, 352});
|
272 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 352, 352});
|
273 |
+
/*********************** Conv 5 **********************/
|
274 |
+
/* H W KH KW PH PW S D GCin GCout */
|
275 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 704, 1024});
|
276 |
+
}
|
277 |
+
|
278 |
+
// ShuffleNet v2 (2.0X scale).
|
279 |
+
static void ShuffleNetV2X20ConvArguments(benchmark::internal::Benchmark* b) {
|
280 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
281 |
+
|
282 |
+
/*********************** Conv 1 **********************/
|
283 |
+
/* H W KH KW PH PW S D GCin GCout */
|
284 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
|
285 |
+
/********************** Stage 2 **********************/
|
286 |
+
/* H W KH KW PH PW S D GCin GCout */
|
287 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 122});
|
288 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 122});
|
289 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 122, 122});
|
290 |
+
/********************** Stage 3 **********************/
|
291 |
+
/* H W KH KW PH PW S D GCin GCout */
|
292 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 244, 244});
|
293 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 244, 244});
|
294 |
+
/********************** Stage 4 **********************/
|
295 |
+
/* H W KH KW PH PW S D GCin GCout */
|
296 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 488, 488});
|
297 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 488, 488});
|
298 |
+
/*********************** Conv 5 **********************/
|
299 |
+
/* H W KH KW PH PW S D GCin GCout */
|
300 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 976, 2048});
|
301 |
+
}
|
302 |
+
|
303 |
+
static void MobileNetV1ConvArguments(benchmark::internal::Benchmark* b) {
|
304 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
305 |
+
|
306 |
+
/* H W KH KW PH PW S D GCin GCout */
|
307 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
|
308 |
+
b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 64});
|
309 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 128});
|
310 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 128, 128});
|
311 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 256});
|
312 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 256, 256});
|
313 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 512});
|
314 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
|
315 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 1024});
|
316 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 1024, 1024});
|
317 |
+
}
|
318 |
+
|
319 |
+
static void MobileNetV2ConvArguments(benchmark::internal::Benchmark* b) {
|
320 |
+
b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
|
321 |
+
|
322 |
+
/* H W KH KW PH PW S D GCin GCout */
|
323 |
+
b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
|
324 |
+
|
325 |
+
/******************** Bottleneck 1 *******************/
|
326 |
+
/* H W KH KW PH PW S D GCin GCout */
|
327 |
+
b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 16});
|
328 |
+
|
329 |
+
/******************** Bottleneck 2 *******************/
|
330 |
+
/* H W KH KW PH PW S D GCin GCout */
|
331 |
+
b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 96});
|
332 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 96, 24});
|
333 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
|
334 |
+
b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 144, 24});
|
335 |
+
|
336 |
+
/******************** Bottleneck 3 *******************/
|
337 |
+
/* H W KH KW PH PW S D GCin GCout */
|
338 |
+
//b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
|
339 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 32});
|
340 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
|
341 |
+
b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
|
342 |
+
//b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
|
343 |
+
//b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
|
344 |
+
|
345 |
+
/******************** Bottleneck 4 *******************/
|
346 |
+
/* H W KH KW PH PW S D GCin GCout */
|
347 |
+
//b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
|
348 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 192, 64});
|
349 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
|
350 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
|
351 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
|
352 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
|
353 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
|
354 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
|
355 |
+
|
356 |
+
/******************** Bottleneck 5 *******************/
|
357 |
+
/* H W KH KW PH PW S D GCin GCout */
|
358 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
|
359 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 96});
|
360 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
|
361 |
+
b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
|
362 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
|
363 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
|
364 |
+
|
365 |
+
/******************** Bottleneck 6 *******************/
|
366 |
+
/* H W KH KW PH PW S D GCin GCout */
|
367 |
+
//b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
|
368 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 160});
|
369 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
|
370 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
|
371 |
+
//b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
|
372 |
+
//b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
|
373 |
+
|
374 |
+
/******************** Bottleneck 7 *******************/
|
375 |
+
/* H W KH KW PH PW S D GCin GCout */
|
376 |
+
//b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
|
377 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 320});
|
378 |
+
|
379 |
+
/**************** Pre-pooling Conv2D *****************/
|
380 |
+
/* H W KH KW PH PW S D GCin GCout */
|
381 |
+
b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 1280});
|
382 |
+
/**************** Post-pooling Conv2D ****************/
|
383 |
+
/* H W KH KW PH PW S D GCin GCout */
|
384 |
+
b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1000});
|
385 |
+
}
|
386 |
+
|
387 |
+
// Registers the distinct non-depthwise Conv2D layer shapes of MobileNet V3
// Small as benchmark arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding, e.g. 2 for a "same" 3x3 —
// confirm against the benchmark body), stride, dilation, and input/output
// channels per group.  1x1 spatial entries correspond to the squeeze-excite
// FC-as-conv layers.  Commented-out entries duplicate shapes already
// registered by an earlier bottleneck.
static void MobileNetV3SmallConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /******************* Initial Stage *******************/
  b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
  /******************** Bottleneck 1 *******************/
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 16, 8});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 8, 16});
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 16});
  /******************** Bottleneck 2 *******************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 72});
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 24});
  /******************** Bottleneck 3 *******************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 24});
  /******************** Bottleneck 4 *******************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 96});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 96, 24});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 96});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 40});
  /******************** Bottleneck 5 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
  /******************** Bottleneck 6 *******************/
  // Same shapes as Bottleneck 5 — already registered above.
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
  /******************** Bottleneck 7 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 120});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 120, 48});
  /******************** Bottleneck 8 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 144});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 40});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 40, 144});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 144, 48});
  /******************** Bottleneck 9 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 288});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 288, 72});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 288});
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 288, 96});
  /******************* Bottleneck 10 *******************/
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
  /******************* Bottleneck 11 *******************/
  // Same shapes as Bottleneck 10 — already registered above.
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
  /********************* Last Stage ********************/
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 1024});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1024, 1001});
}
|
460 |
+
|
461 |
+
// Registers the distinct non-depthwise Conv2D layer shapes of MobileNet V3
// Large as benchmark arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding — confirm against the benchmark
// body), stride, dilation, and input/output channels per group.  1x1 spatial
// entries correspond to the squeeze-excite FC-as-conv layers.  Commented-out
// entries duplicate shapes already registered by an earlier section.
static void MobileNetV3LargeConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /******************* Initial Stage *******************/
  b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
  /******************** Bottleneck 1 *******************/
  b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 16});
  /******************** Bottleneck 2 *******************/
  b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 64});
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 24});
  /******************** Bottleneck 3 *******************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 72, 24});
  /******************** Bottleneck 4 *******************/
  // Expansion conv duplicates the Bottleneck 3 entry above.
  //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 24});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 72});
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 40});
  /******************** Bottleneck 5 *******************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
  /******************** Bottleneck 6 *******************/
  // Same shapes as Bottleneck 5 — already registered above.
  //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
  //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
  /******************** Bottleneck 7 *******************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 240});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 80});
  /******************** Bottleneck 8 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 200});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 80});
  /******************** Bottleneck 9 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
  /******************* Bottleneck 10 *******************/
  // Same shapes as Bottleneck 9 — already registered above.
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
  /******************* Bottleneck 11 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 480});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 480, 120});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 480});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 480, 112});
  /******************* Bottleneck 12 *******************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 672, 168});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 168, 672});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 672, 112});
  /******************* Bottleneck 13 *******************/
  // Expansion conv duplicates the Bottleneck 12 entry above.
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 672, 160});
  /******************* Bottleneck 14 *******************/
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
  /******************* Bottleneck 15 *******************/
  // Same shapes as Bottleneck 14 — already registered above.
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
  //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
  /******************** Last Stage *********************/
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 1280});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1001});
}
|
546 |
+
|
547 |
+
// SqueezeNet 1.0
|
548 |
+
// Registers the distinct Conv2D layer shapes of SqueezeNet 1.0 as benchmark
// arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding, e.g. 6 for a 7x7 "same" conv —
// confirm against the benchmark body), stride, dilation, and input/output
// channels per group.  Each Fire module contributes its squeeze 1x1, expand
// 1x1, and expand 3x3 convolutions; commented-out entries duplicate shapes
// already registered by an earlier Fire module.
static void SqueezeNetV10ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /*********************** Conv 1 **********************/
  b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 96});
  /*********************** Fire 2 **********************/
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 96, 16});
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
  b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
  /*********************** Fire 3 **********************/
  // H fixed from 56 to 55: Fire 3 consumes the 55x55 concatenated output of
  // Fire 2 (see the Fire 2 entries above), so 56 was a typo.
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
  // Expand convs duplicate the Fire 2 entries above.
  //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
  //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
  /*********************** Fire 4 **********************/
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 32});
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 32, 128});
  b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 32, 128});
  /*********************** Fire 5 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
  b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
  /*********************** Fire 6 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 48});
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
  b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
  /*********************** Fire 7 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 48});
  // Expand convs duplicate the Fire 6 entries above.
  //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
  //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
  /*********************** Fire 8 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 64});
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 64, 256});
  b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 64, 256});
  /*********************** Fire 9 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
  b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
  /********************** Conv 10 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
}
|
598 |
+
|
599 |
+
// SqueezeNet 1.1
|
600 |
+
// Registers the distinct Conv2D layer shapes of SqueezeNet 1.1 as benchmark
// arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding — confirm against the benchmark
// body), stride, dilation, and input/output channels per group.  Each Fire
// module contributes its squeeze 1x1, expand 1x1, and expand 3x3
// convolutions; commented-out entries duplicate shapes already registered by
// an earlier Fire module.
static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /*********************** Conv 1 **********************/
  b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 64});
  /*********************** Fire 2 **********************/
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 64, 16});
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
  b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
  /*********************** Fire 3 **********************/
  b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
  // Expand convs duplicate the Fire 2 entries above.
  //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
  //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
  /*********************** Fire 4 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 128, 32});
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
  b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
  /*********************** Fire 5 **********************/
  b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
  // Expand convs duplicate the Fire 4 entries above.
  //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
  //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
  /*********************** Fire 6 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 256, 48});
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
  b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
  /*********************** Fire 7 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 48});
  // Expand convs duplicate the Fire 6 entries above.
  //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
  //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
  /*********************** Fire 8 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 64});
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
  b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
  /*********************** Fire 9 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
  // Expand convs duplicate the Fire 8 entries above.
  //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
  //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
  /********************** Conv 10 **********************/
  b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
}
|
650 |
+
|
651 |
+
// Registers the distinct Conv2D layer shapes of Inception V3 as benchmark
// arguments, from the 299x299 stem down to the final 1x1 classifier.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding — e.g. the asymmetric 1x7/7x1
// convolutions carry padding 6 on the long axis only), stride, dilation, and
// input/output channels per group.
static void InceptionV3ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  b->Args({299, 299, 3, 3, 0, 0, 2, 1, 3, 32});
  b->Args({149, 149, 3, 3, 0, 0, 1, 1, 32, 32});
  b->Args({147, 147, 3, 3, 2, 2, 1, 1, 32, 64});
  b->Args({ 73, 73, 1, 1, 0, 0, 1, 1, 64, 80});
  b->Args({ 73, 73, 3, 3, 0, 0, 1, 1, 80, 192});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 64});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 48});
  b->Args({ 35, 35, 5, 5, 4, 4, 1, 1, 48, 64});
  b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 64, 96});
  b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 96, 96});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 32});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 64});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 48});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 64});
  b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 48});
  b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 288, 384});
  b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 96, 96});
  b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 192});
  b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 128});
  b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 128});
  b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 192});
  b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 128});
  b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 192});
  b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 160});
  b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 160});
  b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 192});
  b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 160});
  b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 192});
  b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 192, 192});
  b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 192, 192});
  b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 320});
  b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 192});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 320});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 384});
  b->Args({ 8, 8, 1, 3, 0, 2, 1, 1, 384, 384});
  b->Args({ 8, 8, 3, 1, 2, 0, 1, 1, 384, 384});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 448});
  b->Args({ 8, 8, 3, 3, 2, 2, 1, 1, 448, 384});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 192});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 320});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 384});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 448});
  b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 192});
  b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 2048, 1001});
}
|
700 |
+
|
701 |
+
// Registers the distinct Conv2D layer shapes of ResNet-18 as benchmark
// arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding — confirm against the benchmark
// body), stride, dilation, and input/output channels per group.  The 1x1
// stride-2 entries are the projection shortcuts at stage transitions.
static void ResNet18ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /********************** Conv 1 ***********************/
  b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
  /********************* Conv 2.X **********************/
  b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
  /********************* Conv 3.X **********************/
  b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 64, 128});
  b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
  b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 64, 128});
  /********************* Conv 4.X **********************/
  b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 128, 256});
  b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
  b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 128, 256});
  /********************* Conv 5.X **********************/
  b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 256, 512});
  b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
  b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 256, 512});
}
|
726 |
+
|
727 |
+
// Registers the distinct Conv2D layer shapes of ResNet-50 as benchmark
// arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding — confirm against the benchmark
// body), stride, dilation, and input/output channels per group.  Each
// bottleneck stage lists its 1x1-reduce / 3x3 / 1x1-expand convolutions plus
// the 1x1 stride-2 projection shortcut; commented-out entries duplicate
// shapes already registered above.
static void ResNet50ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /********************** Conv 1 ***********************/
  b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
  /********************* Conv 2.1 **********************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 64});
  b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
  // Projection shortcut has the same shape as the expand conv above.
  //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
  /********************* Conv 2.X **********************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 64});
  // Remaining convs duplicate the Conv 2.1 entries above.
  //b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
  //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
  /********************** Conv 3.1 *********************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 128});
  b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 128, 128});
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
  b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 256, 512});
  /********************** Conv 3.X *********************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 128});
  b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
  // Expand conv duplicates the Conv 3.1 entry above.
  //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
  /********************** Conv 4.1 *********************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 256});
  b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 256, 256});
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
  b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 512, 1024});
  /********************** Conv 4.X *********************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 256});
  b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
  // Expand conv duplicates the Conv 4.1 entry above.
  //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
  /********************** Conv 5.1 *********************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 512});
  b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 512, 512});
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
  b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 1024, 2048});
  /********************** Conv 5.X *********************/
  b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 2048, 512});
  b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
  // Expand conv duplicates the Conv 5.1 entry above.
  //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
}
|
778 |
+
|
779 |
+
// Registers the distinct Conv2D layer shapes of VGG as benchmark arguments.
// Each entry is {H, W, KH, KW, PH, PW, S, D, GCin, GCout}: input
// height/width, kernel height/width, padding along height/width (values are
// consistent with *total* both-edges padding, i.e. 2 for a "same" 3x3 —
// confirm against the benchmark body), stride, dilation, and input/output
// channels per group.
static void VGGConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  /*       H    W   KH  KW  PH  PW  S  D  GCin  GCout */
  /********************** Conv 1.1 *********************/
  b->Args({224, 224, 3, 3, 2, 2, 1, 1, 3, 64});
  /********************** Conv 1.2 *********************/
  b->Args({224, 224, 3, 3, 2, 2, 1, 1, 64, 64});

  /********************** Conv 2.1 *********************/
  b->Args({112, 112, 3, 3, 2, 2, 1, 1, 64, 128});
  /********************** Conv 2.2 *********************/
  b->Args({112, 112, 3, 3, 2, 2, 1, 1, 128, 128});

  /********************** Conv 3.1 *********************/
  b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 128, 256});
  /********************** Conv 3.2 *********************/
  b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 256, 256});
  /********************** Conv 3.3 *********************/
  b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 256});

  /********************** Conv 4.1 *********************/
  b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 256, 512});
  /********************** Conv 4.2 *********************/
  b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 512, 512});
  /********************** Conv 4.3 *********************/
  b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 512});

  /********************** Conv 5.X *********************/
  b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 512, 512});
  /********************** Conv 5.3 *********************/
  b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
}
|
823 |
+
|
824 |
+
// SRCNN (9-1-5)
|
825 |
+
// Registers the three Conv2D layer shapes of SRCNN 9-1-5 as benchmark
// arguments.  Every stage is a square-kernel, unpadded, stride-1,
// dilation-1 convolution; only the spatial size, kernel size, and channel
// counts differ between stages.
static void SRCNN915ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  // Helper: square input of `size`, square `kernel`, no padding, unit
  // stride/dilation.
  const auto add_conv = [b](int64_t size, int64_t kernel,
                            int64_t channels_in, int64_t channels_out) {
    b->Args({size, size, kernel, kernel, 0, 0, 1, 1, channels_in, channels_out});
  };

  add_conv(384, 9, 1, 64);   // presumably feature extraction — per SRCNN paper
  add_conv(376, 1, 64, 32);  // presumably non-linear mapping
  add_conv(376, 5, 32, 1);   // presumably reconstruction
}
|
833 |
+
|
834 |
+
// SRCNN (9-3-5)
|
835 |
+
// Registers the three Conv2D layer shapes of SRCNN 9-3-5 as benchmark
// arguments.  Every stage is a square-kernel, unpadded, stride-1,
// dilation-1 convolution; only the spatial size, kernel size, and channel
// counts differ between stages.
static void SRCNN935ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  // Helper: square input of `size`, square `kernel`, no padding, unit
  // stride/dilation.
  const auto add_conv = [b](int64_t size, int64_t kernel,
                            int64_t channels_in, int64_t channels_out) {
    b->Args({size, size, kernel, kernel, 0, 0, 1, 1, channels_in, channels_out});
  };

  add_conv(384, 9, 1, 64);   // presumably feature extraction — per SRCNN paper
  add_conv(376, 3, 64, 32);  // presumably non-linear mapping
  add_conv(374, 5, 32, 1);   // presumably reconstruction
}
|
843 |
+
|
844 |
+
// SRCNN (9-5-5)
|
845 |
+
// Registers the three Conv2D layer shapes of SRCNN 9-5-5 as benchmark
// arguments.  Every stage is a square-kernel, unpadded, stride-1,
// dilation-1 convolution; only the spatial size, kernel size, and channel
// counts differ between stages.
static void SRCNN955ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});

  // Helper: square input of `size`, square `kernel`, no padding, unit
  // stride/dilation.
  const auto add_conv = [b](int64_t size, int64_t kernel,
                            int64_t channels_in, int64_t channels_out) {
    b->Args({size, size, kernel, kernel, 0, 0, 1, 1, channels_in, channels_out});
  };

  add_conv(384, 9, 1, 64);   // presumably feature extraction — per SRCNN paper
  add_conv(376, 5, 64, 32);  // presumably non-linear mapping
  add_conv(372, 5, 32, 1);   // presumably reconstruction
}
|
bench/convert.cc
ADDED
@@ -0,0 +1,1339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2021 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cfloat>
|
9 |
+
#include <cmath>
|
10 |
+
#include <functional>
|
11 |
+
#include <limits>
|
12 |
+
#include <memory>
|
13 |
+
#include <random>
|
14 |
+
#include <vector>
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
|
18 |
+
#include <benchmark/benchmark.h>
|
19 |
+
#include <fp16/fp16.h>
|
20 |
+
#include "bench/utils.h"
|
21 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
22 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
23 |
+
#include "tensorflow/lite/interpreter.h"
|
24 |
+
#include "tensorflow/lite/kernels/register.h"
|
25 |
+
#include "tensorflow/lite/model.h"
|
26 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
27 |
+
#include "tensorflow/lite/version.h"
|
28 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
29 |
+
|
30 |
+
|
31 |
+
// Benchmarks the XNNPACK F16->F32 Convert operator in isolation: the operator
// is created/reshaped/set up once, then xnn_run_operator is timed over a 1-D
// tensor of `batch_size` half-precision elements.
void xnnpack_convert_f16_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Inputs are IEEE FP16 bit patterns stored in uint16_t; XNN_EXTRA_BYTES of
  // slack allows XNNPACK kernels to over-read past the logical end.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Poison the output with NaNs so stale results would be detectable.
  std::vector<float> output(batch_size, std::nanf(""));

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_f16_f32(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create F16->F32 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_f16_f32(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape F16->F32 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_f16_f32(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup F16->F32 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run F16->F32 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete F16->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads FP16, writes FP32 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(uint16_t) + sizeof(float)),
      benchmark::Counter::kIsRate);
}
|
98 |
+
|
99 |
+
// Benchmarks the XNNPACK F32->F16 Convert operator in isolation: the operator
// is created/reshaped/set up once, then xnn_run_operator is timed over a 1-D
// tensor of `batch_size` single-precision elements.
void xnnpack_convert_f32_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  // Poison the output with the FP16 NaN bit pattern so stale results would be
  // detectable.
  std::vector<uint16_t> output(batch_size, UINT16_C(0x7E00) /* NaN */);

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_f32_f16(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create F32->F16 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_f32_f16(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape F32->F16 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_f32_f16(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup F32->F16 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run F32->F16 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete F32->F16 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads FP32, writes FP16 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(float) + sizeof(uint16_t)),
      benchmark::Counter::kIsRate);
}
|
165 |
+
|
166 |
+
// Benchmarks the XNNPACK F32->QS8 (signed 8-bit quantize) Convert operator in
// isolation over a 1-D tensor of `batch_size` elements.
void xnnpack_convert_f32_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<int8_t> output(batch_size, 0);

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_f32_qs8(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          1.0f / 128.0f /* scale */, 1 /* zero point */,
          std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create F32->QS8 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_f32_qs8(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape F32->QS8 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_f32_qs8(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup F32->QS8 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run F32->QS8 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete F32->QS8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads FP32, writes INT8 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(float) + sizeof(int8_t)),
      benchmark::Counter::kIsRate);
}
|
234 |
+
|
235 |
+
// Benchmarks the XNNPACK F32->QU8 (unsigned 8-bit quantize) Convert operator
// in isolation over a 1-D tensor of `batch_size` elements.
void xnnpack_convert_f32_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<uint8_t> output(batch_size, 0);

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_f32_qu8(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          1.0f / 128.0f /* scale */, 127 /* zero point */,
          std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max(),
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create F32->QU8 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_f32_qu8(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape F32->QU8 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_f32_qu8(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup F32->QU8 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run F32->QU8 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete F32->QU8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads FP32, writes UINT8 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(float) + sizeof(uint8_t)),
      benchmark::Counter::kIsRate);
}
|
303 |
+
|
304 |
+
// Benchmarks the XNNPACK QS8->QS8 requantizing Convert operator in isolation
// over a 1-D tensor of `batch_size` signed 8-bit elements.
void xnnpack_convert_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Generate int8 values via an int32 distribution (std::uniform_int_distribution
  // is not required to support 8-bit types).
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
    std::ref(rng));

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  std::vector<int8_t> output(batch_size);
  // Poison the output with the 0xAA bit pattern so stale results would be
  // detectable. The original INT8_C(0xAA) is the int value 170, which is out
  // of int8_t range and made the conversion implementation-defined
  // (pre-C++20); cast explicitly to the intended bit pattern instead.
  std::fill(output.begin(), output.end(), static_cast<int8_t>(0xAA));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Input and output quantization parameters differ so the operator performs
  // a real requantization rather than a memcpy.
  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_qs8(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0.75f /* input scale */, -1 /* input zero point */,
    0.5f /* output scale */, 1 /* output zero point */,
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create QS8 Convert operator");
    return;
  }

  status = xnn_reshape_convert_nc_qs8(convert_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape QS8 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_qs8(convert_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup QS8 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QS8 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete QS8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // One int8 read plus one int8 write per element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
374 |
+
|
375 |
+
// Benchmarks the XNNPACK QS8->F32 (signed 8-bit dequantize) Convert operator
// in isolation over a 1-D tensor of `batch_size` elements.
void xnnpack_convert_qs8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Generate int8 values via an int32 distribution (std::uniform_int_distribution
  // is not required to support 8-bit types).
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  // Poison the output with NaNs so stale results would be detectable.
  std::vector<float> output(batch_size, std::nanf(""));

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_qs8_f32(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          1.0f / 255.0f /* scale */, -128 /* zero point */,
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create QS8->F32 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_qs8_f32(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape QS8->F32 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_qs8_f32(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup QS8->F32 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run QS8->F32 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete QS8->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads INT8, writes FP32 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(int8_t) + sizeof(float)),
      benchmark::Counter::kIsRate);
}
|
444 |
+
|
445 |
+
// Benchmarks the XNNPACK QU8->QU8 requantizing Convert operator in isolation
// over a 1-D tensor of `batch_size` unsigned 8-bit elements.
void xnnpack_convert_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Generate uint8 values via an int32 distribution (std::uniform_int_distribution
  // is not required to support 8-bit types).
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
      std::ref(rng));

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  // Poison the output with a recognizable bit pattern so stale results would
  // be detectable.
  std::vector<uint8_t> output(batch_size, UINT8_C(0xAA));

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Input and output quantization parameters differ so the operator performs
  // a real requantization rather than a memcpy.
  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_qu8(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          0.75f /* scale */, 125 /* zero point */,
          0.5f /* scale */, 130 /* zero point */,
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create QU8 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_qu8(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape QU8 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_qu8(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup QU8 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run QU8 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete QU8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // One uint8 read plus one uint8 write per element.
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * 2 * batch_size * sizeof(uint8_t), benchmark::Counter::kIsRate);
}
|
515 |
+
|
516 |
+
// Benchmarks the XNNPACK QU8->F32 (unsigned 8-bit dequantize) Convert
// operator in isolation over a 1-D tensor of `batch_size` elements.
void xnnpack_convert_qu8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Generate uint8 values via an int32 distribution (std::uniform_int_distribution
  // is not required to support 8-bit types).
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
      std::ref(rng));

  // XNN_EXTRA_BYTES of slack allows XNNPACK kernels to over-read past the
  // logical end of the input.
  std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  // Poison the output with NaNs so stale results would be detectable.
  std::vector<float> output(batch_size, std::nanf(""));

  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  if (xnn_create_convert_nc_qu8_f32(
          1 /* channels */, 1 /* input stride */, 1 /* output stride */,
          1.0f / 128.0f /* scale */, 128 /* zero point */,
          0 /* flags */, &convert_op) != xnn_status_success) {
    state.SkipWithError("failed to create QU8->F32 Convert operator");
    return;
  }

  if (xnn_reshape_convert_nc_qu8_f32(convert_op, batch_size, /*threadpool=*/nullptr) !=
      xnn_status_success) {
    state.SkipWithError("failed to reshape QU8->F32 Convert operator");
    return;
  }

  if (xnn_setup_convert_nc_qu8_f32(convert_op, input.data(), output.data()) !=
      xnn_status_success) {
    state.SkipWithError("failed to setup QU8->F32 Convert operator");
    return;
  }

  // Timed region: one operator invocation per benchmark iteration.
  for (auto _ : state) {
    if (xnn_run_operator(convert_op, nullptr /* thread pool */) != xnn_status_success) {
      state.SkipWithError("failed to run QU8->F32 Convert operator");
      return;
    }
  }

  if (xnn_delete_operator(convert_op) != xnn_status_success) {
    state.SkipWithError("failed to delete QU8->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report element and byte throughput (reads UINT8, writes FP32 per element).
  const uint64_t iterations = state.iterations();
  state.counters["elements"] =
      benchmark::Counter(iterations * batch_size, benchmark::Counter::kIsRate);
  state.counters["bytes"] = benchmark::Counter(
      iterations * batch_size * (sizeof(uint8_t) + sizeof(float)),
      benchmark::Counter::kIsRate);
}
|
585 |
+
|
586 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
587 |
+
// Benchmarks TFLite's DEQUANTIZE op converting FP16 to FP32: builds a
// one-op flatbuffer model in memory, runs it through the TFLite interpreter,
// and times Invoke() over a 1-D tensor of `batch_size` elements.
// NOTE: flatbuffer child objects (buffers, tensors, vectors) must be created
// before the parent objects that reference them — the construction order
// below is significant.
void tflite_convert_f16_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // FP16 inputs are produced as IEEE bit patterns held in uint16_t.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);

  // A single empty buffer: tensor 0 (the default) means "no constant data".
  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  // Both tensors are 1-D of length batch_size.
  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  // Tensor 0: FP16 input; tensor 1: FP32 output.
  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT16),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32)
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  // Single subgraph: graph input is tensor 0, graph output is tensor 1.
  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates so the built-in (non-XNNPACK) kernel
  // is what gets benchmarked.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  // Single-threaded for a stable, comparable measurement.
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor's raw storage with FP16 bit patterns.
  uint16_t* input_data = reinterpret_cast<uint16_t*>(interpreter->tensor(0)->data.data);
  std::generate_n(input_data, batch_size, std::ref(f16rng));

  // Timed region: one interpreter invocation per benchmark iteration.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Element and byte throughput (reads FP16, writes FP32 per element).
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
682 |
+
|
683 |
+
// Benchmarks TFLite's QUANTIZE op converting FP32 to signed 8-bit: builds a
// one-op flatbuffer model in memory, runs it through the TFLite interpreter,
// and times Invoke() over a 1-D tensor of `batch_size` elements.
// NOTE: flatbuffer child objects (buffers, tensors, vectors) must be created
// before the parent objects that reference them — the construction order
// below is significant.
void tflite_convert_f32_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);

  // A single empty buffer: tensor 0 (the default) means "no constant data".
  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  // Both tensors are 1-D of length batch_size.
  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  // Tensor 0: FP32 input; tensor 1: INT8 output with quantization parameters
  // matching the XNNPACK benchmark above (scale 1/128, zero point 1).
  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /*min*/, 0 /*max*/,
                           builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
                           builder.CreateVector<int64_t>({1 /* zero point */})))
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  // Single subgraph: graph input is tensor 0, graph output is tensor 1.
  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
      builder,
      builder.CreateVector(tensors.data(), tensors.size()),
      builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
      builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
      builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Resolver without default delegates so the built-in (non-XNNPACK) kernel
  // is what gets benchmarked.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  // Single-threaded for a stable, comparable measurement.
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the FP32 input tensor with random values in [-1, 1].
  std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));

  // Timed region: one interpreter invocation per benchmark iteration.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Element and byte throughput (reads FP32, writes INT8 per element).
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
780 |
+
|
781 |
+
void tflite_convert_f32_qu8(benchmark::State& state) {
|
782 |
+
const size_t batch_size = state.range(0);
|
783 |
+
|
784 |
+
std::random_device random_device;
|
785 |
+
auto rng = std::mt19937(random_device());
|
786 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
|
787 |
+
|
788 |
+
flatbuffers::FlatBufferBuilder builder;
|
789 |
+
flatbuffers::Offset<tflite::OperatorCode> operator_code =
|
790 |
+
CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
|
791 |
+
|
792 |
+
std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
|
793 |
+
tflite::CreateBuffer(builder, builder.CreateVector({})),
|
794 |
+
}};
|
795 |
+
|
796 |
+
const std::array<int32_t, 1> shape{{
|
797 |
+
static_cast<int32_t>(batch_size)
|
798 |
+
}};
|
799 |
+
|
800 |
+
const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
|
801 |
+
tflite::CreateTensor(builder,
|
802 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
803 |
+
tflite::TensorType_FLOAT32),
|
804 |
+
tflite::CreateTensor(builder,
|
805 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
806 |
+
tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
|
807 |
+
tflite::CreateQuantizationParameters(builder,
|
808 |
+
0 /*min*/, 0 /*max*/,
|
809 |
+
builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
|
810 |
+
builder.CreateVector<int64_t>({127 /* zero point */})))
|
811 |
+
}};
|
812 |
+
|
813 |
+
const std::array<int32_t, 1> op_inputs{{0}};
|
814 |
+
const std::array<int32_t, 1> op_outputs{{1}};
|
815 |
+
flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
|
816 |
+
0 /* opcode_index */,
|
817 |
+
builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
|
818 |
+
builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
|
819 |
+
|
820 |
+
const std::array<int32_t, 1> graph_inputs{{0}};
|
821 |
+
const std::array<int32_t, 1> graph_outputs{{1}};
|
822 |
+
flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
|
823 |
+
builder,
|
824 |
+
builder.CreateVector(tensors.data(), tensors.size()),
|
825 |
+
builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
|
826 |
+
builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
|
827 |
+
builder.CreateVector(&op, 1));
|
828 |
+
|
829 |
+
flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
|
830 |
+
|
831 |
+
flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
|
832 |
+
TFLITE_SCHEMA_VERSION,
|
833 |
+
builder.CreateVector(&operator_code, 1),
|
834 |
+
builder.CreateVector(&subgraph, 1),
|
835 |
+
description,
|
836 |
+
builder.CreateVector(buffers.data(), buffers.size()));
|
837 |
+
|
838 |
+
builder.Finish(model_buffer);
|
839 |
+
|
840 |
+
const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
|
841 |
+
tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
|
842 |
+
tflite::InterpreterBuilder interpreterBuilder(model, resolver);
|
843 |
+
std::unique_ptr<tflite::Interpreter> interpreter;
|
844 |
+
if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
|
845 |
+
state.SkipWithError("failed to create TFLite interpreter");
|
846 |
+
return;
|
847 |
+
}
|
848 |
+
interpreter->SetNumThreads(1);
|
849 |
+
|
850 |
+
if (interpreter->AllocateTensors() != kTfLiteOk) {
|
851 |
+
state.SkipWithError("failed to allocate tensors");
|
852 |
+
return;
|
853 |
+
}
|
854 |
+
|
855 |
+
std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));
|
856 |
+
|
857 |
+
for (auto _ : state) {
|
858 |
+
if (interpreter->Invoke() != kTfLiteOk) {
|
859 |
+
state.SkipWithError("failed to invoke TFLite interpreter");
|
860 |
+
return;
|
861 |
+
}
|
862 |
+
}
|
863 |
+
|
864 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
865 |
+
if (cpu_frequency != 0) {
|
866 |
+
state.counters["cpufreq"] = cpu_frequency;
|
867 |
+
}
|
868 |
+
|
869 |
+
state.counters["elements"] =
|
870 |
+
benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
|
871 |
+
|
872 |
+
const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
|
873 |
+
state.counters["bytes"] =
|
874 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
875 |
+
|
876 |
+
interpreter.reset();
|
877 |
+
}
|
878 |
+
|
879 |
+
// Benchmarks TFLite's QUANTIZE operator requantizing a 1-D signed 8-bit
// tensor (scale 0.75, zero point -1) into a different QS8 quantization
// (scale 0.5, zero point 1). Element count comes from the benchmark argument.
void tflite_convert_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Pseudo-random generator producing values over the full int8 range.
  std::random_device seed_source;
  auto prng = std::mt19937(seed_source());
  auto gen_i8 = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(prng));

  // Assemble a one-operator in-memory flatbuffer model: QS8 -> QUANTIZE -> QS8.
  flatbuffers::FlatBufferBuilder fbb;
  flatbuffers::Offset<tflite::OperatorCode> quantize_opcode =
      CreateOperatorCode(fbb, tflite::BuiltinOperator_QUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffer_table{{
      tflite::CreateBuffer(fbb, fbb.CreateVector({})),
  }};

  // Both tensors are 1-D with batch_size elements.
  const std::array<int32_t, 1> tensor_shape{{
      static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensor_table{{
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({0.75f /* scale */}),
                               fbb.CreateVector<int64_t>({-1 /* zero point */}))),
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({0.5f /* scale */}),
                               fbb.CreateVector<int64_t>({1 /* zero point */}))),
  }};

  // Operator 0 maps tensor 0 (input) to tensor 1 (output).
  const std::array<int32_t, 1> operator_inputs{{0}};
  const std::array<int32_t, 1> operator_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> convert_op = tflite::CreateOperator(fbb,
      0 /* opcode_index */,
      fbb.CreateVector<int32_t>(operator_inputs.data(), operator_inputs.size()),
      fbb.CreateVector<int32_t>(operator_outputs.data(), operator_outputs.size()));

  const std::array<int32_t, 1> subgraph_inputs{{0}};
  const std::array<int32_t, 1> subgraph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> graph = tflite::CreateSubGraph(
      fbb,
      fbb.CreateVector(tensor_table.data(), tensor_table.size()),
      fbb.CreateVector<int32_t>(subgraph_inputs.data(), subgraph_inputs.size()),
      fbb.CreateVector<int32_t>(subgraph_outputs.data(), subgraph_outputs.size()),
      fbb.CreateVector(&convert_op, 1));

  flatbuffers::Offset<flatbuffers::String> model_description = fbb.CreateString("Quantize model");

  flatbuffers::Offset<tflite::Model> model_offset = tflite::CreateModel(fbb,
      TFLITE_SCHEMA_VERSION,
      fbb.CreateVector(&quantize_opcode, 1),
      fbb.CreateVector(&graph, 1),
      model_description,
      fbb.CreateVector(buffer_table.data(), buffer_table.size()));

  fbb.Finish(model_offset);

  // Instantiate a single-threaded interpreter over the in-memory model.
  const tflite::Model* model = tflite::GetModel(fbb.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates op_resolver;
  tflite::InterpreterBuilder interpreter_builder(model, op_resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreter_builder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(gen_i8));

  // Benchmark loop: each iteration performs one full conversion pass.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  // Report CPU frequency (when available) plus element and byte throughput.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
      benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // One QS8 element read plus one QS8 element written per batch element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
  state.counters["bytes"] =
      benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
982 |
+
|
983 |
+
// Benchmarks TFLite's DEQUANTIZE operator converting a 1-D signed 8-bit
// tensor (scale 1/255, zero point -128) to FP32. Element count comes from
// the benchmark argument.
void tflite_convert_qs8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Pseudo-random generator producing values over the full int8 range.
  std::random_device seed_source;
  auto prng = std::mt19937(seed_source());
  auto gen_i8 = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(prng));

  // Assemble a one-operator in-memory flatbuffer model: QS8 -> DEQUANTIZE -> FP32.
  flatbuffers::FlatBufferBuilder fbb;
  flatbuffers::Offset<tflite::OperatorCode> dequantize_opcode =
      CreateOperatorCode(fbb, tflite::BuiltinOperator_DEQUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffer_table{{
      tflite::CreateBuffer(fbb, fbb.CreateVector({})),
  }};

  // Both tensors are 1-D with batch_size elements.
  const std::array<int32_t, 1> tensor_shape{{
      static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensor_table{{
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({1.0f / 255.0f /* scale */}),
                               fbb.CreateVector<int64_t>({-128 /* zero point */}))),
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_FLOAT32)
  }};

  // Operator 0 maps tensor 0 (input) to tensor 1 (output).
  const std::array<int32_t, 1> operator_inputs{{0}};
  const std::array<int32_t, 1> operator_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> convert_op = tflite::CreateOperator(fbb,
      0 /* opcode_index */,
      fbb.CreateVector<int32_t>(operator_inputs.data(), operator_inputs.size()),
      fbb.CreateVector<int32_t>(operator_outputs.data(), operator_outputs.size()));

  const std::array<int32_t, 1> subgraph_inputs{{0}};
  const std::array<int32_t, 1> subgraph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> graph = tflite::CreateSubGraph(
      fbb,
      fbb.CreateVector(tensor_table.data(), tensor_table.size()),
      fbb.CreateVector<int32_t>(subgraph_inputs.data(), subgraph_inputs.size()),
      fbb.CreateVector<int32_t>(subgraph_outputs.data(), subgraph_outputs.size()),
      fbb.CreateVector(&convert_op, 1));

  flatbuffers::Offset<flatbuffers::String> model_description = fbb.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_offset = tflite::CreateModel(fbb,
      TFLITE_SCHEMA_VERSION,
      fbb.CreateVector(&dequantize_opcode, 1),
      fbb.CreateVector(&graph, 1),
      model_description,
      fbb.CreateVector(buffer_table.data(), buffer_table.size()));

  fbb.Finish(model_offset);

  // Instantiate a single-threaded interpreter over the in-memory model.
  const tflite::Model* model = tflite::GetModel(fbb.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates op_resolver;
  tflite::InterpreterBuilder interpreter_builder(model, op_resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreter_builder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(gen_i8));

  // Benchmark loop: each iteration performs one full conversion pass.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  // Report CPU frequency (when available) plus element and byte throughput.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
      benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // One QS8 element read plus one FP32 element written per batch element.
  const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
  state.counters["bytes"] =
      benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
1082 |
+
|
1083 |
+
// Benchmarks TFLite's QUANTIZE operator requantizing a 1-D unsigned 8-bit
// tensor (scale 0.75, zero point 125) into a different QU8 quantization
// (scale 0.5, zero point 130). Element count comes from the benchmark argument.
void tflite_convert_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Pseudo-random generator producing values over the full uint8 range.
  std::random_device seed_source;
  auto prng = std::mt19937(seed_source());
  auto gen_u8 = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
      std::ref(prng));

  // Assemble a one-operator in-memory flatbuffer model: QU8 -> QUANTIZE -> QU8.
  flatbuffers::FlatBufferBuilder fbb;
  flatbuffers::Offset<tflite::OperatorCode> quantize_opcode =
      CreateOperatorCode(fbb, tflite::BuiltinOperator_QUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffer_table{{
      tflite::CreateBuffer(fbb, fbb.CreateVector({})),
  }};

  // Both tensors are 1-D with batch_size elements.
  const std::array<int32_t, 1> tensor_shape{{
      static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensor_table{{
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({0.75f /* scale */}),
                               fbb.CreateVector<int64_t>({125 /* zero point */}))),
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({0.5f /* scale */}),
                               fbb.CreateVector<int64_t>({130 /* zero point */})))
  }};

  // Operator 0 maps tensor 0 (input) to tensor 1 (output).
  const std::array<int32_t, 1> operator_inputs{{0}};
  const std::array<int32_t, 1> operator_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> convert_op = tflite::CreateOperator(fbb,
      0 /* opcode_index */,
      fbb.CreateVector<int32_t>(operator_inputs.data(), operator_inputs.size()),
      fbb.CreateVector<int32_t>(operator_outputs.data(), operator_outputs.size()));

  const std::array<int32_t, 1> subgraph_inputs{{0}};
  const std::array<int32_t, 1> subgraph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> graph = tflite::CreateSubGraph(
      fbb,
      fbb.CreateVector(tensor_table.data(), tensor_table.size()),
      fbb.CreateVector<int32_t>(subgraph_inputs.data(), subgraph_inputs.size()),
      fbb.CreateVector<int32_t>(subgraph_outputs.data(), subgraph_outputs.size()),
      fbb.CreateVector(&convert_op, 1));

  flatbuffers::Offset<flatbuffers::String> model_description = fbb.CreateString("Quantize model");

  flatbuffers::Offset<tflite::Model> model_offset = tflite::CreateModel(fbb,
      TFLITE_SCHEMA_VERSION,
      fbb.CreateVector(&quantize_opcode, 1),
      fbb.CreateVector(&graph, 1),
      model_description,
      fbb.CreateVector(buffer_table.data(), buffer_table.size()));

  fbb.Finish(model_offset);

  // Instantiate a single-threaded interpreter over the in-memory model.
  const tflite::Model* model = tflite::GetModel(fbb.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates op_resolver;
  tflite::InterpreterBuilder interpreter_builder(model, op_resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreter_builder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(gen_u8));

  // Benchmark loop: each iteration performs one full conversion pass.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  // Report CPU frequency (when available) plus element and byte throughput.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
      benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // One QU8 element read plus one QU8 element written per batch element.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
  state.counters["bytes"] =
      benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
1186 |
+
|
1187 |
+
// Benchmarks TFLite's DEQUANTIZE operator converting a 1-D unsigned 8-bit
// tensor (scale 1/128, zero point 128) to FP32. Element count comes from
// the benchmark argument.
void tflite_convert_qu8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Pseudo-random generator producing values over the full uint8 range.
  std::random_device seed_source;
  auto prng = std::mt19937(seed_source());
  auto gen_u8 = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
      std::ref(prng));

  // Assemble a one-operator in-memory flatbuffer model: QU8 -> DEQUANTIZE -> FP32.
  flatbuffers::FlatBufferBuilder fbb;
  flatbuffers::Offset<tflite::OperatorCode> dequantize_opcode =
      CreateOperatorCode(fbb, tflite::BuiltinOperator_DEQUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffer_table{{
      tflite::CreateBuffer(fbb, fbb.CreateVector({})),
  }};

  // Both tensors are 1-D with batch_size elements.
  const std::array<int32_t, 1> tensor_shape{{
      static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensor_table{{
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                           tflite::CreateQuantizationParameters(fbb,
                               0 /*min*/, 0 /*max*/,
                               fbb.CreateVector<float>({1.0f / 128.0f /* scale */}),
                               fbb.CreateVector<int64_t>({128 /* zero point */}))),
      tflite::CreateTensor(fbb,
                           fbb.CreateVector<int32_t>(tensor_shape.data(), tensor_shape.size()),
                           tflite::TensorType_FLOAT32)
  }};

  // Operator 0 maps tensor 0 (input) to tensor 1 (output).
  const std::array<int32_t, 1> operator_inputs{{0}};
  const std::array<int32_t, 1> operator_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> convert_op = tflite::CreateOperator(fbb,
      0 /* opcode_index */,
      fbb.CreateVector<int32_t>(operator_inputs.data(), operator_inputs.size()),
      fbb.CreateVector<int32_t>(operator_outputs.data(), operator_outputs.size()));

  const std::array<int32_t, 1> subgraph_inputs{{0}};
  const std::array<int32_t, 1> subgraph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> graph = tflite::CreateSubGraph(
      fbb,
      fbb.CreateVector(tensor_table.data(), tensor_table.size()),
      fbb.CreateVector<int32_t>(subgraph_inputs.data(), subgraph_inputs.size()),
      fbb.CreateVector<int32_t>(subgraph_outputs.data(), subgraph_outputs.size()),
      fbb.CreateVector(&convert_op, 1));

  flatbuffers::Offset<flatbuffers::String> model_description = fbb.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_offset = tflite::CreateModel(fbb,
      TFLITE_SCHEMA_VERSION,
      fbb.CreateVector(&dequantize_opcode, 1),
      fbb.CreateVector(&graph, 1),
      model_description,
      fbb.CreateVector(buffer_table.data(), buffer_table.size()));

  fbb.Finish(model_offset);

  // Instantiate a single-threaded interpreter over the in-memory model.
  const tflite::Model* model = tflite::GetModel(fbb.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates op_resolver;
  tflite::InterpreterBuilder interpreter_builder(model, op_resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreter_builder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(gen_u8));

  // Benchmark loop: each iteration performs one full conversion pass.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  // Report CPU frequency (when available) plus element and byte throughput.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
      benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // One QU8 element read plus one FP32 element written per batch element.
  const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
  state.counters["bytes"] =
      benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
1286 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
1287 |
+
|
1288 |
+
BENCHMARK(xnnpack_convert_f16_f32)
|
1289 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
1290 |
+
->UseRealTime();
|
1291 |
+
BENCHMARK(xnnpack_convert_f32_f16)
|
1292 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint16_t>)
|
1293 |
+
->UseRealTime();
|
1294 |
+
BENCHMARK(xnnpack_convert_f32_qs8)
|
1295 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
|
1296 |
+
->UseRealTime();
|
1297 |
+
BENCHMARK(xnnpack_convert_f32_qu8)
|
1298 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
|
1299 |
+
->UseRealTime();
|
1300 |
+
BENCHMARK(xnnpack_convert_qs8)
|
1301 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
|
1302 |
+
->UseRealTime();
|
1303 |
+
BENCHMARK(xnnpack_convert_qs8_f32)
|
1304 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
|
1305 |
+
->UseRealTime();
|
1306 |
+
BENCHMARK(xnnpack_convert_qu8)
|
1307 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
|
1308 |
+
->UseRealTime();
|
1309 |
+
BENCHMARK(xnnpack_convert_qu8_f32)
|
1310 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
|
1311 |
+
->UseRealTime();
|
1312 |
+
|
1313 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
1314 |
+
BENCHMARK(tflite_convert_f16_f32)
|
1315 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
1316 |
+
->UseRealTime();
|
1317 |
+
BENCHMARK(tflite_convert_f32_qs8)
|
1318 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
|
1319 |
+
->UseRealTime();
|
1320 |
+
BENCHMARK(tflite_convert_f32_qu8)
|
1321 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
|
1322 |
+
->UseRealTime();
|
1323 |
+
BENCHMARK(tflite_convert_qs8)
|
1324 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
|
1325 |
+
->UseRealTime();
|
1326 |
+
BENCHMARK(tflite_convert_qs8_f32)
|
1327 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
|
1328 |
+
->UseRealTime();
|
1329 |
+
BENCHMARK(tflite_convert_qu8)
|
1330 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
|
1331 |
+
->UseRealTime();
|
1332 |
+
BENCHMARK(tflite_convert_qu8_f32)
|
1333 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
|
1334 |
+
->UseRealTime();
|
1335 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
1336 |
+
|
1337 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
1338 |
+
BENCHMARK_MAIN();
|
1339 |
+
#endif
|
bench/convolution.cc
ADDED
@@ -0,0 +1,1768 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cfloat>
|
11 |
+
#include <cmath>
|
12 |
+
#include <functional>
|
13 |
+
#include <limits>
|
14 |
+
#include <memory>
|
15 |
+
#include <ostream>
|
16 |
+
#include <random>
|
17 |
+
#include <string>
|
18 |
+
#include <vector>
|
19 |
+
|
20 |
+
#include <xnnpack.h>
|
21 |
+
|
22 |
+
#include <benchmark/benchmark.h>
|
23 |
+
#include <fp16/fp16.h>
|
24 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
25 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
26 |
+
#include "tensorflow/lite/interpreter.h"
|
27 |
+
#include "tensorflow/lite/kernels/register.h"
|
28 |
+
#include "tensorflow/lite/model.h"
|
29 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
30 |
+
#include "tensorflow/lite/version.h"
|
31 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
32 |
+
#include "bench/utils.h"
|
33 |
+
|
34 |
+
// Benchmarks the XNNPACK QU8 (asymmetric quantized uint8) NHWC 2D convolution
// operator. Benchmark arguments (state.range): batch size, input H/W,
// kernel H/W, padding H/W, subsampling (stride), dilation, groups, and
// input/output channels per group. Reports OPS (2*MACs per second).
void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  // Effective kernel extent accounts for dilation between taps.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // TF-style padding: split total padding, extra pixel goes to right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
  std::vector<int32_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(i32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate through enough operator/output pairs to exceed the last-level
  // cache, so each timed iteration works on cold weights and outputs.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
  std::vector<uint8_t> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> convolution_operators(num_buffers);
  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_qu8(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      127, 0.5f,  // input zero point / scale
      127, 0.5f,  // kernel zero point / scale
      kernel.data(), bias.data(),
      127, 0.5f, 0, 255,  // output zero point / scale / min / max
      0 /* flags */, nullptr, nullptr, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create QUINT8 Convolution operator");
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    status = xnn_reshape_convolution2d_nhwc_qu8(
      convolution_operators[i],
      batch_size, input_height, input_width,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      nullptr /* thread pool */);
    // Check the reshape status before it is overwritten by the setup call;
    // previously a reshape failure was silently ignored.
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape QUINT8 Convolution operator");
      return;
    }
    status = xnn_setup_convolution2d_nhwc_qu8(
      convolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QUINT8 Convolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    // Warm the input in L1 so the measurement reflects compute, not the
    // first-touch miss on the input tensor.
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index],
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QUINT8 Convolution operator");
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QUINT8 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 ops (multiply + accumulate) per MAC.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
154 |
+
|
155 |
+
// Benchmarks the XNNPACK QS8 (symmetric quantized int8) NHWC 2D convolution
// operator. Benchmark arguments (state.range): batch size, input H/W,
// kernel H/W, padding H/W, subsampling (stride), dilation, groups, and
// input/output channels per group. Reports OPS (2*MACs per second).
void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  // Effective kernel extent accounts for dilation between taps.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // TF-style padding: split total padding, extra pixel goes to right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
  std::vector<int32_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(i32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate through enough operator/output pairs to exceed the last-level
  // cache, so each timed iteration works on cold weights and outputs.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
  std::vector<int8_t> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> convolution_operators(num_buffers);
  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_qs8(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      127, 0.5f, 0.5f,  // input zero point / input scale / kernel scale
      kernel.data(), bias.data(),
      127, 0.5f, -128, 127,  // output zero point / scale / min / max
      0 /* flags */, nullptr, nullptr, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create QINT8 Convolution operator");
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    status = xnn_reshape_convolution2d_nhwc_qs8(
      convolution_operators[i],
      batch_size, input_height, input_width,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      nullptr /* thread pool */);
    // Check the reshape status before it is overwritten by the setup call;
    // previously a reshape failure was silently ignored.
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape QINT8 Convolution operator");
      return;
    }
    status = xnn_setup_convolution2d_nhwc_qs8(
      convolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QINT8 Convolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    // Warm the input in L1 so the measurement reflects compute, not the
    // first-touch miss on the input tensor.
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(int8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index],
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QINT8 Convolution operator");
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QINT8 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 ops (multiply + accumulate) per MAC.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
275 |
+
|
276 |
+
// Benchmarks the XNNPACK FP16 (half-precision, stored as uint16_t bit
// patterns) NHWC 2D convolution operator. Benchmark arguments (state.range):
// batch size, input H/W, kernel H/W, padding H/W, subsampling (stride),
// dilation, groups, and input/output channels per group. Reports FLOPS.
void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
  // Generate FP16 values by converting random FP32 values.
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  // Effective kernel extent accounts for dilation between taps.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // TF-style padding: split total padding, extra pixel goes to right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // XNN_EXTRA_BYTES of slack allows vectorized kernels to over-read.
  std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
  std::vector<uint16_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f16rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate through enough operator/output pairs to exceed the last-level
  // cache, so each timed iteration works on cold weights and outputs.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
  std::vector<uint16_t> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> convolution_operators(num_buffers);
  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_f16(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      kernel.data(), bias.data(),
      -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
      0 /* flags */, nullptr, nullptr, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP16 Convolution operator");
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    status = xnn_reshape_convolution2d_nhwc_f16(
      convolution_operators[i],
      batch_size, input_height, input_width,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      nullptr /* thread pool */);
    // Check the reshape status before it is overwritten by the setup call;
    // previously a reshape failure was silently ignored.
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape FP16 Convolution operator");
      return;
    }
    status = xnn_setup_convolution2d_nhwc_f16(
      convolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP16 Convolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    // Warm the input in L1 so the measurement reflects compute, not the
    // first-touch miss on the input tensor.
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP16 Convolution operator");
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP16 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + accumulate) per MAC.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
393 |
+
|
394 |
+
// Benchmarks the XNNPACK FP32 NHWC 2D convolution operator. Benchmark
// arguments (state.range): batch size, input H/W, kernel H/W, padding H/W,
// subsampling (stride), dilation, groups, and input/output channels per
// group. Reports FLOPS (2*MACs per second).
void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  // Effective kernel extent accounts for dilation between taps.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // TF-style padding: split total padding, extra pixel goes to right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // XNN_EXTRA_BYTES of slack allows vectorized kernels to over-read.
  std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate through enough operator/output pairs to exceed the last-level
  // cache, so each timed iteration works on cold weights and outputs.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (kernel.size() + bias.size() + output_elements));
  std::vector<float> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> convolution_operators(num_buffers);
  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_f32(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      kernel.data(), bias.data(),
      -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
      0 /* flags */, nullptr, nullptr, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP32 Convolution operator");
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    status = xnn_reshape_convolution2d_nhwc_f32(
      convolution_operators[i],
      batch_size, input_height, input_width,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      nullptr /* thread pool */);
    // Check the reshape status before it is overwritten by the setup call;
    // previously a reshape failure was silently ignored.
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape FP32 Convolution operator");
      return;
    }
    status = xnn_setup_convolution2d_nhwc_f32(
      convolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP32 Convolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    // Warm the input in L1 so the measurement reflects compute, not the
    // first-touch miss on the input tensor.
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 Convolution operator");
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP32 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + accumulate) per MAC.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
510 |
+
|
511 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
512 |
+
void tflite_convolution_f32(benchmark::State& state, const char* net) {
|
513 |
+
const size_t batch_size = state.range(0);
|
514 |
+
const size_t input_height = state.range(1);
|
515 |
+
const size_t input_width = state.range(2);
|
516 |
+
const size_t kernel_height = state.range(3);
|
517 |
+
const size_t kernel_width = state.range(4);
|
518 |
+
const size_t padding_height = state.range(5);
|
519 |
+
const size_t padding_width = state.range(6);
|
520 |
+
const size_t subsampling = state.range(7);
|
521 |
+
const size_t dilation = state.range(8);
|
522 |
+
const size_t groups = state.range(9);
|
523 |
+
const size_t group_input_channels = state.range(10);
|
524 |
+
const size_t group_output_channels = state.range(11);
|
525 |
+
|
526 |
+
bool is_depthwise = false;
|
527 |
+
if (groups != 1) {
|
528 |
+
if (group_input_channels == 1) {
|
529 |
+
is_depthwise = true;
|
530 |
+
} else {
|
531 |
+
state.SkipWithError("grouped convolution is not supported");
|
532 |
+
return;
|
533 |
+
}
|
534 |
+
}
|
535 |
+
|
536 |
+
std::random_device random_device;
|
537 |
+
auto rng = std::mt19937(random_device());
|
538 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
|
539 |
+
|
540 |
+
const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
|
541 |
+
const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
|
542 |
+
|
543 |
+
tflite::Padding padding = tflite::Padding_VALID;
|
544 |
+
if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
|
545 |
+
padding = tflite::Padding_SAME;
|
546 |
+
} else if (padding_width == 0 && padding_height == 0) {
|
547 |
+
padding = tflite::Padding_VALID;
|
548 |
+
} else {
|
549 |
+
state.SkipWithError("unsupported padding");
|
550 |
+
return;
|
551 |
+
}
|
552 |
+
|
553 |
+
const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
|
554 |
+
const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
|
555 |
+
|
556 |
+
std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
|
557 |
+
std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
|
558 |
+
std::vector<float> bias(groups * group_output_channels);
|
559 |
+
std::generate(bias.begin(), bias.end(), std::ref(f32rng));
|
560 |
+
|
561 |
+
flatbuffers::FlatBufferBuilder builder;
|
562 |
+
flatbuffers::Offset<tflite::OperatorCode> operator_code =
|
563 |
+
CreateOperatorCode(
|
564 |
+
builder,
|
565 |
+
is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
|
566 |
+
0);
|
567 |
+
|
568 |
+
flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
|
569 |
+
builder,
|
570 |
+
padding,
|
571 |
+
static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
|
572 |
+
tflite::ActivationFunctionType_NONE,
|
573 |
+
static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
|
574 |
+
|
575 |
+
flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
|
576 |
+
builder,
|
577 |
+
padding,
|
578 |
+
static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
|
579 |
+
static_cast<int32_t>(group_output_channels),
|
580 |
+
tflite::ActivationFunctionType_NONE,
|
581 |
+
static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
|
582 |
+
|
583 |
+
flatbuffers::Offset<tflite::Buffer> buffers[3] = {
|
584 |
+
tflite::CreateBuffer(builder, builder.CreateVector({})),
|
585 |
+
tflite::CreateBuffer(builder, builder.CreateVector(
|
586 |
+
reinterpret_cast<const uint8_t*>(kernel.data()),
|
587 |
+
sizeof(float) * kernel.size())),
|
588 |
+
tflite::CreateBuffer(builder, builder.CreateVector(
|
589 |
+
reinterpret_cast<const uint8_t*>(bias.data()),
|
590 |
+
sizeof(float) * bias.size())),
|
591 |
+
};
|
592 |
+
|
593 |
+
const int32_t input_shape[4] = {
|
594 |
+
static_cast<int32_t>(batch_size),
|
595 |
+
static_cast<int32_t>(input_height),
|
596 |
+
static_cast<int32_t>(input_width),
|
597 |
+
static_cast<int32_t>(groups * group_input_channels)
|
598 |
+
};
|
599 |
+
const int32_t output_shape[4] = {
|
600 |
+
static_cast<int32_t>(batch_size),
|
601 |
+
static_cast<int32_t>(output_height),
|
602 |
+
static_cast<int32_t>(output_width),
|
603 |
+
static_cast<int32_t>(groups * group_output_channels)
|
604 |
+
};
|
605 |
+
const int32_t filter_shape[4] = {
|
606 |
+
static_cast<int32_t>(group_output_channels),
|
607 |
+
static_cast<int32_t>(kernel_height),
|
608 |
+
static_cast<int32_t>(kernel_width),
|
609 |
+
static_cast<int32_t>(groups * group_input_channels)
|
610 |
+
};
|
611 |
+
const int32_t bias_shape[1] = {
|
612 |
+
static_cast<int32_t>(groups * group_output_channels)
|
613 |
+
};
|
614 |
+
|
615 |
+
flatbuffers::Offset<tflite::Tensor> tensors[4] = {
|
616 |
+
tflite::CreateTensor(builder,
|
617 |
+
builder.CreateVector<int32_t>(input_shape, 4),
|
618 |
+
tflite::TensorType_FLOAT32,
|
619 |
+
0 /* buffer id */,
|
620 |
+
builder.CreateString("input")),
|
621 |
+
tflite::CreateTensor(builder,
|
622 |
+
builder.CreateVector<int32_t>(filter_shape, 4),
|
623 |
+
tflite::TensorType_FLOAT32,
|
624 |
+
1 /* buffer id */,
|
625 |
+
builder.CreateString("filter")),
|
626 |
+
tflite::CreateTensor(builder,
|
627 |
+
builder.CreateVector<int32_t>(bias_shape, 1),
|
628 |
+
tflite::TensorType_FLOAT32,
|
629 |
+
2 /* buffer id */,
|
630 |
+
builder.CreateString("bias")),
|
631 |
+
tflite::CreateTensor(builder,
|
632 |
+
builder.CreateVector<int32_t>(output_shape, 4),
|
633 |
+
tflite::TensorType_FLOAT32,
|
634 |
+
0 /* buffer id */,
|
635 |
+
builder.CreateString("output")),
|
636 |
+
};
|
637 |
+
|
638 |
+
const int32_t op_inputs[3] = { 0, 1, 2 };
|
639 |
+
const int32_t op_outputs[1] = { 3 };
|
640 |
+
flatbuffers::Offset<tflite::Operator> op = CreateOperator(
|
641 |
+
builder,
|
642 |
+
0 /* opcode_index */,
|
643 |
+
builder.CreateVector<int32_t>(op_inputs, 3),
|
644 |
+
builder.CreateVector<int32_t>(op_outputs, 1),
|
645 |
+
is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
|
646 |
+
is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
|
647 |
+
/*custom_options */ 0,
|
648 |
+
tflite::CustomOptionsFormat_FLEXBUFFERS);
|
649 |
+
|
650 |
+
const int32_t graph_inputs[1] = { 0 };
|
651 |
+
const int32_t graph_outputs[1] = { 3 };
|
652 |
+
flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
|
653 |
+
builder,
|
654 |
+
builder.CreateVector(tensors, 4),
|
655 |
+
builder.CreateVector<int32_t>(graph_inputs, 1),
|
656 |
+
builder.CreateVector<int32_t>(graph_outputs, 1),
|
657 |
+
builder.CreateVector(&op, 1),
|
658 |
+
builder.CreateString("Conv2D subgraph"));
|
659 |
+
|
660 |
+
flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");
|
661 |
+
|
662 |
+
flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
|
663 |
+
TFLITE_SCHEMA_VERSION,
|
664 |
+
builder.CreateVector(&operator_code, 1),
|
665 |
+
builder.CreateVector(&subgraph, 1),
|
666 |
+
description,
|
667 |
+
builder.CreateVector(buffers, 3));
|
668 |
+
|
669 |
+
builder.Finish(model_buffer);
|
670 |
+
|
671 |
+
const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
|
672 |
+
tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
|
673 |
+
tflite::InterpreterBuilder interpreterBuilder(model, resolver);
|
674 |
+
std::unique_ptr<tflite::Interpreter> interpreter;
|
675 |
+
if (interpreterBuilder(&interpreter) != kTfLiteOk) {
|
676 |
+
state.SkipWithError("failed to create TFLite interpreter");
|
677 |
+
return;
|
678 |
+
}
|
679 |
+
if (interpreter == nullptr) {
|
680 |
+
state.SkipWithError("TFLite interpreter is null");
|
681 |
+
return;
|
682 |
+
}
|
683 |
+
interpreter->SetNumThreads(1);
|
684 |
+
|
685 |
+
if (interpreter->AllocateTensors() != kTfLiteOk) {
|
686 |
+
state.SkipWithError("failed to allocate tensors");
|
687 |
+
return;
|
688 |
+
}
|
689 |
+
|
690 |
+
std::generate(
|
691 |
+
interpreter->typed_tensor<float>(0),
|
692 |
+
interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
|
693 |
+
std::ref(f32rng));
|
694 |
+
|
695 |
+
for (auto _ : state) {
|
696 |
+
state.PauseTiming();
|
697 |
+
benchmark::utils::WipeCache();
|
698 |
+
benchmark::utils::PrefetchToL1(
|
699 |
+
interpreter->typed_tensor<float>(0),
|
700 |
+
batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
|
701 |
+
state.ResumeTiming();
|
702 |
+
|
703 |
+
if (interpreter->Invoke() != kTfLiteOk) {
|
704 |
+
state.SkipWithError("failed to invoke TFLite interpreter");
|
705 |
+
return;
|
706 |
+
}
|
707 |
+
}
|
708 |
+
|
709 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
710 |
+
if (cpu_frequency != 0) {
|
711 |
+
state.counters["cpufreq"] = cpu_frequency;
|
712 |
+
}
|
713 |
+
|
714 |
+
state.counters["FLOPS"] = benchmark::Counter(
|
715 |
+
uint64_t(state.iterations()) * 2 *
|
716 |
+
batch_size * output_height * output_width *
|
717 |
+
groups * group_input_channels * group_output_channels *
|
718 |
+
kernel_height * kernel_width,
|
719 |
+
benchmark::Counter::kIsRate);
|
720 |
+
|
721 |
+
interpreter.reset();
|
722 |
+
}
|
723 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
724 |
+
|
725 |
+
// ShuffleNet v1 with 1 group.
|
726 |
+
static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
|
727 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
728 |
+
|
729 |
+
/*************************** Conv 1 **************************/
|
730 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
731 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
732 |
+
/******************* Stage 2: stride-2 unit ******************/
|
733 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
734 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 36});
|
735 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 36, 1, 1});
|
736 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 120});
|
737 |
+
/******************* Stage 2: stride-1 units *****************/
|
738 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
739 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 36});
|
740 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 36, 1, 1});
|
741 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 144});
|
742 |
+
/******************* Stage 3: stride-2 unit ******************/
|
743 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
744 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 72});
|
745 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 72, 1, 1});
|
746 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 144});
|
747 |
+
/******************* Stage 3: stride-1 units *****************/
|
748 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
749 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 72});
|
750 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 72, 1, 1});
|
751 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 288});
|
752 |
+
/******************* Stage 4: stride-2 unit ******************/
|
753 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
754 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 144});
|
755 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 144, 1, 1});
|
756 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 288});
|
757 |
+
/******************* Stage 4: stride-1 units *****************/
|
758 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
759 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 144});
|
760 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 144, 1, 1});
|
761 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 576});
|
762 |
+
}
|
763 |
+
|
764 |
+
// ShuffleNet v1 with 2 groups.
|
765 |
+
static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
|
766 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
767 |
+
|
768 |
+
/*************************** Conv 1 **************************/
|
769 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
770 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
771 |
+
/******************* Stage 2: stride-2 unit ******************/
|
772 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
773 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 50});
|
774 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 50, 1, 1});
|
775 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 88});
|
776 |
+
/******************* Stage 2: stride-1 units *****************/
|
777 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
778 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 25});
|
779 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 50, 1, 1});
|
780 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 100});
|
781 |
+
/******************* Stage 3: stride-2 unit ******************/
|
782 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
783 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 50});
|
784 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 100, 1, 1});
|
785 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 100});
|
786 |
+
/******************* Stage 3: stride-1 units *****************/
|
787 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
788 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 50});
|
789 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 100, 1, 1});
|
790 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 200});
|
791 |
+
/******************* Stage 4: stride-2 unit ******************/
|
792 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
793 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 100});
|
794 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 200, 1, 1});
|
795 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 200});
|
796 |
+
/******************* Stage 4: stride-1 units *****************/
|
797 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
798 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 400, 100});
|
799 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 200, 1, 1});
|
800 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 400});
|
801 |
+
}
|
802 |
+
|
803 |
+
// ShuffleNet v1 with 3 groups.
|
804 |
+
static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
|
805 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
806 |
+
|
807 |
+
/*************************** Conv 1 **************************/
|
808 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
809 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
810 |
+
/******************* Stage 2: stride-2 unit ******************/
|
811 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
812 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 60});
|
813 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 60, 1, 1});
|
814 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 72});
|
815 |
+
/******************* Stage 2: stride-1 units *****************/
|
816 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
817 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 20});
|
818 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 60, 1, 1});
|
819 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 80});
|
820 |
+
/******************* Stage 3: stride-2 unit ******************/
|
821 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
822 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 40});
|
823 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 120, 1, 1});
|
824 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 80});
|
825 |
+
/******************* Stage 3: stride-1 units *****************/
|
826 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
827 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 40});
|
828 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 120, 1, 1});
|
829 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 160});
|
830 |
+
/******************* Stage 4: stride-2 unit ******************/
|
831 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
832 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 80});
|
833 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 240, 1, 1});
|
834 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 160});
|
835 |
+
/******************* Stage 4: stride-1 units *****************/
|
836 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
837 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 320, 80});
|
838 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 240, 1, 1});
|
839 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 320});
|
840 |
+
}
|
841 |
+
|
842 |
+
// ShuffleNet v1 with 4 groups.
|
843 |
+
static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
|
844 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
845 |
+
|
846 |
+
/*************************** Conv 1 **************************/
|
847 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
848 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
849 |
+
/******************* Stage 2: stride-2 unit ******************/
|
850 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
851 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 68});
|
852 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 68, 1, 1});
|
853 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 62});
|
854 |
+
/******************* Stage 2: stride-1 units *****************/
|
855 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
856 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 17});
|
857 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 68, 1, 1});
|
858 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 68});
|
859 |
+
/******************* Stage 3: stride-2 unit ******************/
|
860 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
861 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 34});
|
862 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 136, 1, 1});
|
863 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 68});
|
864 |
+
/******************* Stage 3: stride-1 units *****************/
|
865 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
866 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 34});
|
867 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 136, 1, 1});
|
868 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 136});
|
869 |
+
/******************* Stage 4: stride-2 unit ******************/
|
870 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
871 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 68});
|
872 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 272, 1, 1});
|
873 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 136});
|
874 |
+
/******************* Stage 4: stride-1 units *****************/
|
875 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
876 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 272, 68});
|
877 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 272, 1, 1});
|
878 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 272});
|
879 |
+
}
|
880 |
+
|
881 |
+
// ShuffleNet v1 with 8 groups.
|
882 |
+
static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
|
883 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
884 |
+
|
885 |
+
/*************************** Conv 1 **************************/
|
886 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
887 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
888 |
+
/******************* Stage 2: stride-2 unit ******************/
|
889 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
890 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 96});
|
891 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 96, 1, 1});
|
892 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 45});
|
893 |
+
/******************* Stage 2: stride-1 units *****************/
|
894 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
895 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 12});
|
896 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 96, 1, 1});
|
897 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 48});
|
898 |
+
/******************* Stage 3: stride-2 unit ******************/
|
899 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
900 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 24});
|
901 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
|
902 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 48});
|
903 |
+
/******************* Stage 3: stride-1 units *****************/
|
904 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
905 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 24});
|
906 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 192, 1, 1});
|
907 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 96});
|
908 |
+
/******************* Stage 4: stride-2 unit ******************/
|
909 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
910 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 48});
|
911 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 384, 1, 1});
|
912 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 96});
|
913 |
+
/******************* Stage 4: stride-1 units *****************/
|
914 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
915 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 192, 48});
|
916 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 384, 1, 1});
|
917 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 192});
|
918 |
+
}
|
919 |
+
|
920 |
+
// ShuffleNet v2 (0.5X scale)
|
921 |
+
static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
|
922 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
923 |
+
|
924 |
+
/*************************** Conv 1 **************************/
|
925 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
926 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
927 |
+
/************************** Stage 2 **************************/
|
928 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
929 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
|
930 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 24});
|
931 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 24});
|
932 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 24, 1, 1});
|
933 |
+
/************************** Stage 3 **************************/
|
934 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
935 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 48, 1, 1});
|
936 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 48});
|
937 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 48, 48});
|
938 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 48, 1, 1});
|
939 |
+
/************************** Stage 4 **************************/
|
940 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
941 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 96, 1, 1});
|
942 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 96});
|
943 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 96});
|
944 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 96, 1, 1});
|
945 |
+
/*************************** Conv 5 **************************/
|
946 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
947 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 192, 1024});
|
948 |
+
}
|
949 |
+
|
950 |
+
// ShuffleNet v2 (1.0X scale)
|
951 |
+
static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
|
952 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
953 |
+
|
954 |
+
/*************************** Conv 1 **************************/
|
955 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
956 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
957 |
+
/************************** Stage 2 **************************/
|
958 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
959 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
|
960 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 58});
|
961 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 58});
|
962 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 58, 1, 1});
|
963 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 58, 58});
|
964 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 58, 1, 1});
|
965 |
+
/************************** Stage 3 **************************/
|
966 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
967 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 116, 1, 1});
|
968 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 116, 116});
|
969 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 116, 116});
|
970 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 116, 1, 1});
|
971 |
+
/************************** Stage 4 **************************/
|
972 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
973 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 232, 1, 1});
|
974 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 232, 232});
|
975 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 232, 232});
|
976 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 232, 1, 1});
|
977 |
+
/*************************** Conv 5 **************************/
|
978 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
979 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 464, 1024});
|
980 |
+
}
|
981 |
+
|
982 |
+
// ShuffleNet v2 (1.5X scale)
|
983 |
+
static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
|
984 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
985 |
+
|
986 |
+
/*************************** Conv 1 **************************/
|
987 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
988 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
989 |
+
/************************** Stage 2 **************************/
|
990 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
991 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
|
992 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
|
993 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 88});
|
994 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 88, 1, 1});
|
995 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 88});
|
996 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
|
997 |
+
/************************** Stage 3 **************************/
|
998 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
999 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 176, 1, 1});
|
1000 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 176, 176});
|
1001 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 176, 176});
|
1002 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 176, 1, 1});
|
1003 |
+
/************************** Stage 4 **************************/
|
1004 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1005 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 352, 1, 1});
|
1006 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 352, 352});
|
1007 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 352, 352});
|
1008 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 352, 1, 1});
|
1009 |
+
/*************************** Conv 5 **************************/
|
1010 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1011 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 704, 1024});
|
1012 |
+
}
|
1013 |
+
|
1014 |
+
// ShuffleNet v2 (2.0X scale)
|
1015 |
+
static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
|
1016 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
1017 |
+
|
1018 |
+
/*************************** Conv 1 **************************/
|
1019 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1020 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
|
1021 |
+
/************************** Stage 2 **************************/
|
1022 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1023 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
|
1024 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 122});
|
1025 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 122});
|
1026 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 122, 1, 1});
|
1027 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 122, 122});
|
1028 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 122, 1, 1});
|
1029 |
+
/************************** Stage 3 **************************/
|
1030 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1031 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 244, 1, 1});
|
1032 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 244, 244});
|
1033 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 244, 244});
|
1034 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 244, 1, 1});
|
1035 |
+
/************************** Stage 4 **************************/
|
1036 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1037 |
+
b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 488, 1, 1});
|
1038 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 488, 488});
|
1039 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 488, 488});
|
1040 |
+
b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 488, 1, 1});
|
1041 |
+
/*************************** Conv 5 **************************/
|
1042 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1043 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 976, 2048});
|
1044 |
+
}
|
1045 |
+
|
1046 |
+
// MobileNet v1: one benchmark case per convolution layer.
static void MobileNetV1(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  // Each row follows the ArgNames order declared above; the network
  // alternates depthwise 3x3 and pointwise 1x1 convolutions.
  const auto layer = [b](const std::vector<int64_t>& args) { b->Args(args); };

  layer({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
  layer({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
  layer({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 64});
  layer({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
  layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 128});
  layer({1, 56, 56, 3, 3, 2, 2, 1, 1, 128, 1, 1});
  layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 128, 128});
  layer({1, 56, 56, 3, 3, 2, 2, 2, 1, 128, 1, 1});
  layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 256});
  layer({1, 28, 28, 3, 3, 2, 2, 1, 1, 256, 1, 1});
  layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 256, 256});
  layer({1, 28, 28, 3, 3, 2, 2, 2, 1, 256, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 512});
  layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 512, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
  layer({1, 14, 14, 3, 3, 2, 2, 2, 1, 512, 1, 1});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 1024});
  layer({1, 7, 7, 3, 3, 2, 2, 1, 1, 1024, 1, 1});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 1024, 1024});
}
|
1070 |
+
|
1071 |
+
// MobileNet v2: one benchmark case per distinct convolution layer.
static void MobileNetV2(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  // Each row follows the ArgNames order declared above. Rows left
  // commented out repeat a shape already registered earlier and are kept
  // disabled, as in the upstream benchmark, to avoid duplicate cases.
  const auto layer = [b](const std::vector<int64_t>& args) { b->Args(args); };

  layer({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});

  // ---- Bottleneck 1 ----
  layer({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
  layer({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 16});

  // ---- Bottleneck 2 ----
  layer({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 96});
  layer({1, 112, 112, 3, 3, 2, 2, 2, 1, 96, 1, 1});
  layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 96, 24});
  layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
  layer({1, 56, 56, 3, 3, 2, 2, 1, 1, 144, 1, 1});
  layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 144, 24});

  // ---- Bottleneck 3 ----
  //layer({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
  layer({1, 56, 56, 3, 3, 2, 2, 2, 1, 144, 1, 1});
  layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 32});
  layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
  layer({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
  layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
  //layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
  //layer({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
  //layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});

  // ---- Bottleneck 4 ----
  //layer({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
  layer({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 192, 64});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
  layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
  //layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
  //layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});

  // ---- Bottleneck 5 ----
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
  //layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 96});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
  layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
  layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
  //layer({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});

  // ---- Bottleneck 6 ----
  //layer({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
  layer({1, 14, 14, 3, 3, 2, 2, 2, 1, 576, 1, 1});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 160});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  layer({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
  //layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  //layer({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
  //layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});

  // ---- Bottleneck 7 ----
  //layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  //layer({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 320});

  // ---- Pre-pooling Conv2D ----
  layer({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 320, 1280});
  // ---- Post-pooling Conv2D ----
  layer({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1000});
}
|
1155 |
+
|
1156 |
+
static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
|
1157 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
1158 |
+
|
1159 |
+
/*********************** Initial Stage ***********************/
|
1160 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1161 |
+
b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
|
1162 |
+
/*********************** Bottleneck 1 ************************/
|
1163 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1164 |
+
b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 16, 1, 1});
|
1165 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 16, 8});
|
1166 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 8, 16});
|
1167 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 16});
|
1168 |
+
/*********************** Bottleneck 2 ************************/
|
1169 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1170 |
+
b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 72});
|
1171 |
+
b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 72, 1, 1});
|
1172 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 24});
|
1173 |
+
/*********************** Bottleneck 3 ************************/
|
1174 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1175 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
|
1176 |
+
b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
|
1177 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 24});
|
1178 |
+
/*********************** Bottleneck 4 ************************/
|
1179 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1180 |
+
b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 96});
|
1181 |
+
b->Args({1, 28, 28, 5, 5, 4, 4, 2, 1, 96, 1, 1});
|
1182 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 96, 24});
|
1183 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 96});
|
1184 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 40});
|
1185 |
+
/*********************** Bottleneck 5 ************************/
|
1186 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1187 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
|
1188 |
+
b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
|
1189 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
|
1190 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
|
1191 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
|
1192 |
+
/*********************** Bottleneck 6 ************************/
|
1193 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1194 |
+
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
|
1195 |
+
//b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
|
1196 |
+
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
|
1197 |
+
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
|
1198 |
+
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
|
1199 |
+
/*********************** Bottleneck 7 ************************/
|
1200 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1201 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 120});
|
1202 |
+
b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 120, 1, 1});
|
1203 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
|
1204 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
|
1205 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 120, 48});
|
1206 |
+
/*********************** Bottleneck 8 ************************/
|
1207 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1208 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 144});
|
1209 |
+
b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 144, 1, 1});
|
1210 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 40});
|
1211 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 40, 144});
|
1212 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 144, 48});
|
1213 |
+
/*********************** Bottleneck 9 ************************/
|
1214 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1215 |
+
b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 288});
|
1216 |
+
b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 288, 1, 1});
|
1217 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 288, 72});
|
1218 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 288});
|
1219 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 288, 96});
|
1220 |
+
/*********************** Bottleneck 10 ***********************/
|
1221 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1222 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
|
1223 |
+
b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
|
1224 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
|
1225 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
|
1226 |
+
b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
|
1227 |
+
/*********************** Bottleneck 11 ***********************/
|
1228 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1229 |
+
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
|
1230 |
+
//b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
|
1231 |
+
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
|
1232 |
+
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
|
1233 |
+
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
|
1234 |
+
/************************ Last Stage ************************/
|
1235 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1236 |
+
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
|
1237 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 1024});
|
1238 |
+
b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1024, 1001});
|
1239 |
+
}
|
1240 |
+
|
1241 |
+
// Registers the convolution shapes of MobileNet v3 Large as benchmark arguments.
// One Args() row per convolution, in network order:
//          N    H    W  KH  KW  PH  PW  S  D    G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding — 3x3 kernels use 2; TODO confirm against
// the benchmark harness), S = stride, D = dilation, G = groups, GCin/GCout =
// channels per group. Rows with G > 1 and GCin = GCout = 1 are depthwise
// convolutions; rows with H = W = 1 are presumably the squeeze-and-excitation
// 1x1 convolutions. Commented-out rows repeat shapes already listed above and
// are kept as a record of the full network structure.
static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*********************** Initial Stage ***********************/
  b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
  /*********************** Bottleneck 1 ************************/
  b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 16, 1, 1});
  b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 16});
  /*********************** Bottleneck 2 ************************/
  b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 24});
  /*********************** Bottleneck 3 ************************/
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 72, 1, 1});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 72, 24});
  /*********************** Bottleneck 4 ************************/
  //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 56, 56, 5, 5, 4, 4, 2, 1, 72, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 24});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 40});
  /*********************** Bottleneck 5 ************************/
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
  b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
  /*********************** Bottleneck 6 ************************/
  // Same shapes as Bottleneck 5 — already benchmarked above.
  //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
  //b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
  //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
  //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
  //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
  /*********************** Bottleneck 7 ************************/
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 240});
  b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 240, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 80});
  /*********************** Bottleneck 8 ************************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 200});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 200, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 200, 80});
  /*********************** Bottleneck 9 ************************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
  /********************** Bottleneck 10 ***********************/
  // Same shapes as Bottleneck 9 — already benchmarked above.
  //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
  //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
  //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
  /********************** Bottleneck 11 ***********************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 480});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 480, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 480, 120});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 480});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 480, 112});
  /********************** Bottleneck 12 ***********************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 672, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 672, 168});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 168, 672});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 672, 112});
  /********************** Bottleneck 13 ***********************/
  //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
  b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 672, 1, 1});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 672, 160});
  /********************** Bottleneck 14 ***********************/
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
  /********************** Bottleneck 15 ***********************/
  // Same shapes as Bottleneck 14 — already benchmarked above.
  //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  //b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
  //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
  //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
  //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
  /************************ Last Stage ***********************/
  //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 1280});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1001});
}
|
1341 |
+
|
1342 |
+
// SqueezeNet 1.0
|
1343 |
+
// Registers the convolution shapes of SqueezeNet 1.0 as benchmark arguments.
// One Args() row per convolution, in network order:
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding — 3x3 kernels use 2; TODO confirm
// against the benchmark harness), S = stride, D = dilation, G = groups,
// GCin/GCout = channels per group. Each Fire module contributes a 1x1 squeeze
// convolution plus 1x1 and 3x3 expand convolutions; expand rows that repeat a
// shape already listed are commented out to avoid duplicate measurements.
static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 96});
  /************************** Fire 2 *************************/
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 96, 16});
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 3 *************************/
  // Fixed H from 56 to 55: fire3 operates on the same 55x55x128 feature map
  // (fire2's concatenated 64+64 output) as the other 55x55 Fire modules, so
  // the previous {56, 55} entry benchmarked a shape that never occurs in the
  // network.
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
  // Expand convolutions have the same shapes as Fire 2 — already benchmarked.
  //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 4 *************************/
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 32});
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 32, 128});
  b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 5 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
  b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 6 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 48});
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
  b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 7 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 48});
  // Expand convolutions have the same shapes as Fire 6 — already benchmarked.
  //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
  //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 8 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 64});
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************** Fire 9 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************* Conv 10 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
}
|
1393 |
+
|
1394 |
+
// SqueezeNet 1.1
|
1395 |
+
// Registers the convolution shapes of SqueezeNet 1.1 as benchmark arguments.
// One Args() row per convolution, in network order:
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding; TODO confirm against the harness),
// S = stride, D = dilation, G = groups, GCin/GCout = channels per group.
// Relative to v1.0, v1.1 uses a 3x3 conv1 and earlier downsampling, so the
// Fire modules run on smaller feature maps. Commented-out rows duplicate
// shapes already listed above.
static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 64});
  /************************** Fire 2 *************************/
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 64, 16});
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 3 *************************/
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
  // Expand convolutions match Fire 2 — already benchmarked above.
  //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 4 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 128, 32});
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
  b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 5 *************************/
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
  // Expand convolutions match Fire 4 — already benchmarked above.
  //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
  //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 6 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 256, 48});
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
  b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 7 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 48});
  // Expand convolutions match Fire 6 — already benchmarked above.
  //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
  //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 8 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 64});
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************** Fire 9 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
  // Expand convolutions match Fire 8 — already benchmarked above.
  //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************* Conv 10 *************************/
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
}
|
1445 |
+
|
1446 |
+
// Registers the distinct convolution shapes of Inception v3 as benchmark
// arguments. One Args() row per distinct convolution shape, roughly in
// network order (stem, 35x35 modules, 17x17 modules, 8x8 modules, classifier):
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding; TODO confirm against the harness),
// S = stride, D = dilation, G = groups, GCin/GCout = channels per group.
// The 1x7/7x1 rows are the factorized 7x7 convolutions of the 17x17 modules;
// the final row is the 1001-way classifier over pooled 2048-d features.
static void InceptionV3(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*        N    H    W   KH  KW  PH  PW  S  D  G   GCin  GCout */
  b->Args({1, 299, 299, 3, 3, 0, 0, 2, 1, 1, 3, 32});
  b->Args({1, 149, 149, 3, 3, 0, 0, 1, 1, 1, 32, 32});
  b->Args({1, 147, 147, 3, 3, 2, 2, 1, 1, 1, 32, 64});
  b->Args({1, 73, 73, 1, 1, 0, 0, 1, 1, 1, 64, 80});
  b->Args({1, 73, 73, 3, 3, 0, 0, 1, 1, 1, 80, 192});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 48});
  b->Args({1, 35, 35, 5, 5, 4, 4, 1, 1, 1, 48, 64});
  b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 64, 96});
  b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 96, 96});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 32});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 48});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 48});
  b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 288, 384});
  b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 96, 96});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 192});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 128});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 128});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 128});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 192});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 160});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 160});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 160});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 192});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 192, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 192, 192});
  b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 320});
  b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 192});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 320});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 384});
  b->Args({1, 8, 8, 1, 3, 0, 2, 1, 1, 1, 384, 384});
  b->Args({1, 8, 8, 3, 1, 2, 0, 1, 1, 1, 384, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 448});
  b->Args({1, 8, 8, 3, 3, 2, 2, 1, 1, 1, 448, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 192});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 320});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 448});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 192});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2048, 1001});
}
|
1495 |
+
|
1496 |
+
// Registers the distinct convolution shapes of ResNet-18 as benchmark
// arguments. One Args() row per distinct shape:
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding; TODO confirm against the harness),
// S = stride, D = dilation, G = groups, GCin/GCout = channels per group.
// In each Conv N.X stage, the 1x1 stride-2 row is the projection shortcut of
// the first (downsampling) block; repeated blocks reuse the shapes listed.
static void ResNet18(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1 *************************/
  b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
  /************************ Conv 2.X ************************/
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
  /************************ Conv 3.X ************************/
  b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 64, 128});
  b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
  b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 64, 128});
  /************************ Conv 4.X ************************/
  b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 128, 256});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
  b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 128, 256});
  /************************ Conv 5.X ************************/
  b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 256, 512});
  b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
  b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 256, 512});
}
|
1521 |
+
|
1522 |
+
// Registers the distinct convolution shapes of ResNet-50 as benchmark
// arguments. One Args() row per distinct shape:
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding; TODO confirm against the harness),
// S = stride, D = dilation, G = groups, GCin/GCout = channels per group.
// Conv N.1 sections are the first (projection/downsampling) bottleneck of a
// stage; Conv N.X sections cover the repeated bottlenecks. Commented-out rows
// duplicate shapes already listed above.
static void ResNet50(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1 *************************/
  b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
  /************************ Conv 2.1 ************************/
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 64});
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  // Projection shortcut has the same shape as the expansion above.
  //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  /************************ Conv 2.X ************************/
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 64});
  // Remaining rows match Conv 2.1 — already benchmarked above.
  //b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
  //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  /************************ Conv 3.1 ************************/
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 128});
  b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 128, 128});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
  b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 256, 512});
  /************************ Conv 3.X ************************/
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 128});
  b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
  // Expansion matches Conv 3.1 — already benchmarked above.
  //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
  /************************ Conv 4.1 ************************/
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 256});
  b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 256, 256});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
  b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 512, 1024});
  /************************ Conv 4.X ************************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 256});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
  // Expansion matches Conv 4.1 — already benchmarked above.
  //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
  /************************ Conv 5.1 ************************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 512});
  b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 512, 512});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
  b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 1024, 2048});
  /************************ Conv 5.X ************************/
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 2048, 512});
  b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
  // Expansion matches Conv 5.1 — already benchmarked above.
  //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
}
|
1573 |
+
|
1574 |
+
// Registers the distinct convolution shapes of VGG as benchmark arguments.
// One Args() row per distinct shape, in network order:
//          N    H    W  KH  KW  PH  PW  S  D  G  GCin  GCout
// N = batch, H/W = input spatial size, KH/KW = kernel size, PH/PW = padding
// (apparently total left+right padding; TODO confirm against the harness),
// S = stride, D = dilation, G = groups, GCin/GCout = channels per group.
// All convolutions are stride-1; spatial size halves between stages via
// pooling, which is not benchmarked here. The 1x1 rows (Conv 3.3/4.3/5.3)
// suggest this table follows the VGG variant with 1x1 third-layer convs
// (VGG-16 configuration C) — TODO confirm which variant was intended.
static void VGG(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1.1 ************************/
  b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 3, 64});
  /************************* Conv 1.2 ************************/
  b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 64, 64});

  /************************* Conv 2.1 ************************/
  b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 64, 128});
  /************************* Conv 2.2 ************************/
  b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 128, 128});

  /************************* Conv 3.1 ************************/
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 128, 256});
  /************************* Conv 3.2 ************************/
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 256, 256});
  /************************* Conv 3.3 ************************/
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 256});

  /************************* Conv 4.1 ************************/
  b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 256, 512});
  /************************* Conv 4.2 ************************/
  b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 512, 512});
  /************************* Conv 4.3 ************************/
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 512});

  /************************* Conv 5.X ************************/
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 512, 512});
  /************************* Conv 5.3 ************************/
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
}
|
1618 |
+
|
1619 |
+
// SRCNN (9-1-5)
|
1620 |
+
// Registers the three convolution layers of SRCNN (9-1-5) as benchmark
// arguments. Layers run "valid" (no padding), so each layer's input shrinks
// by KH-1 / KW-1 relative to the previous one.
static void SRCNN915(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  // One row per layer:     N    H    W  KH KW PH PW  S  D  G GCin GCout
  static const int64_t layers[][12] = {
    {1, 384, 384, 9, 9, 0, 0, 1, 1, 1,  1, 64},  // 9x9 patch extraction
    {1, 376, 376, 1, 1, 0, 0, 1, 1, 1, 64, 32},  // 1x1 non-linear mapping
    {1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 32,  1},  // 5x5 reconstruction
  };
  for (const auto& r : layers) {
    b->Args({r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11]});
  }
}
|
1628 |
+
|
1629 |
+
// SRCNN (9-3-5)
|
1630 |
+
// Registers the three convolution layers of SRCNN (9-3-5) as benchmark
// arguments. Layers run "valid" (no padding), so each layer's input shrinks
// by KH-1 / KW-1 relative to the previous one.
static void SRCNN935(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  // One row per layer:     N    H    W  KH KW PH PW  S  D  G GCin GCout
  static const int64_t layers[][12] = {
    {1, 384, 384, 9, 9, 0, 0, 1, 1, 1,  1, 64},  // 9x9 patch extraction
    {1, 376, 376, 3, 3, 0, 0, 1, 1, 1, 64, 32},  // 3x3 non-linear mapping
    {1, 374, 374, 5, 5, 0, 0, 1, 1, 1, 32,  1},  // 5x5 reconstruction
  };
  for (const auto& r : layers) {
    b->Args({r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], r[10], r[11]});
  }
}
|
1638 |
+
|
1639 |
+
// SRCNN (9-5-5)
|
1640 |
+
static void SRCNN955(benchmark::internal::Benchmark* b) {
|
1641 |
+
b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
|
1642 |
+
|
1643 |
+
/* N H W KH KW PH PW S D G GCin GCout */
|
1644 |
+
b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
|
1645 |
+
b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 64, 32});
|
1646 |
+
b->Args({1, 372, 372, 5, 5, 0, 0, 1, 1, 1, 32, 1});
|
1647 |
+
}
|
1648 |
+
|
1649 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
|
1650 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
|
1651 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
|
1652 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
|
1653 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
|
1654 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
|
1655 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
|
1656 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
|
1657 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
|
1658 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
|
1659 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
|
1660 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
|
1661 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
|
1662 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
|
1663 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
|
1664 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
|
1665 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
|
1666 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
|
1667 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
|
1668 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
|
1669 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
|
1670 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
|
1671 |
+
|
1672 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
|
1673 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
|
1674 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
|
1675 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
|
1676 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
|
1677 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
|
1678 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
|
1679 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
|
1680 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
|
1681 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
|
1682 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
|
1683 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
|
1684 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
|
1685 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
|
1686 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
|
1687 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
|
1688 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
|
1689 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
|
1690 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
|
1691 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
|
1692 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
|
1693 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
|
1694 |
+
|
1695 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
|
1696 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
|
1697 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
|
1698 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
|
1699 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
|
1700 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
|
1701 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
|
1702 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
|
1703 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
|
1704 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
|
1705 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
|
1706 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
|
1707 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
|
1708 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
|
1709 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
|
1710 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
|
1711 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
|
1712 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
|
1713 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();
|
1714 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
|
1715 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
|
1716 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
|
1717 |
+
|
1718 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
|
1719 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
|
1720 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
|
1721 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
|
1722 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
|
1723 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
|
1724 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
|
1725 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
|
1726 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
|
1727 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
|
1728 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
|
1729 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
|
1730 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
|
1731 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
|
1732 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
|
1733 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
|
1734 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
|
1735 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
|
1736 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();
|
1737 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
|
1738 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
|
1739 |
+
BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
|
1740 |
+
|
1741 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
1742 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
|
1743 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
|
1744 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
|
1745 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
|
1746 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
|
1747 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
|
1748 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
|
1749 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
|
1750 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
|
1751 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
|
1752 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
|
1753 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
|
1754 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
|
1755 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
|
1756 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
|
1757 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
|
1758 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
|
1759 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
|
1760 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
|
1761 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
|
1762 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
|
1763 |
+
BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
|
1764 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
1765 |
+
|
1766 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
1767 |
+
BENCHMARK_MAIN();
|
1768 |
+
#endif
|
bench/cs16-bfly4.cc
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
|
7 |
+
#include <algorithm>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <numeric>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include "bench/utils.h"
|
14 |
+
#include <benchmark/benchmark.h>
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/fft.h>
|
20 |
+
#include <xnnpack/microfnptr.h>
|
21 |
+
#include <xnnpack/microparams-init.h>
|
22 |
+
|
23 |
+
|
24 |
+
void cs16_bfly4(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_cs16_bfly4_ukernel_fn bfly4,
|
27 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
28 |
+
{
|
29 |
+
if ((isa_check != nullptr) && !isa_check(state)) {
|
30 |
+
return;
|
31 |
+
}
|
32 |
+
const size_t fft_size = state.range(0);
|
33 |
+
const size_t batch = state.range(1);
|
34 |
+
const size_t samples = state.range(2);
|
35 |
+
const size_t stride = state.range(3);
|
36 |
+
|
37 |
+
assert(fft_size == samples * stride * 4); // 4 for bfly4.
|
38 |
+
|
39 |
+
std::vector<int16_t, AlignedAllocator<int16_t, 64>> output(fft_size * 2);
|
40 |
+
std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(fft_size * 3 / 4 * 2);
|
41 |
+
|
42 |
+
std::iota(output.begin(), output.end(), 0);
|
43 |
+
std::iota(twiddle.begin(), twiddle.end(), 0);
|
44 |
+
|
45 |
+
for (auto _ : state) {
|
46 |
+
bfly4(batch, samples * sizeof(int16_t) * 2, output.data(), twiddle.data(), stride * sizeof(int16_t) * 2);
|
47 |
+
}
|
48 |
+
|
49 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
50 |
+
if (cpu_frequency != 0) {
|
51 |
+
state.counters["cpufreq"] = cpu_frequency;
|
52 |
+
}
|
53 |
+
}
|
54 |
+
|
55 |
+
static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
|
56 |
+
{
|
57 |
+
b->ArgNames({"fft_size", "batch", "samples", "stride"});
|
58 |
+
b->Args({256, 1, 1, 64});
|
59 |
+
b->Args({256, 4, 1, 64});
|
60 |
+
b->Args({256, 1, 4, 16});
|
61 |
+
b->Args({256, 4, 4, 16});
|
62 |
+
b->Args({256, 1, 16, 4});
|
63 |
+
b->Args({256, 4, 16, 4});
|
64 |
+
b->Args({256, 1, 64, 1});
|
65 |
+
}
|
66 |
+
|
67 |
+
static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b)
|
68 |
+
{
|
69 |
+
b->ArgNames({"fft_size", "batch", "samples", "stride"});
|
70 |
+
b->Args({256, 1, 1, 64});
|
71 |
+
b->Args({256, 4, 1, 64});
|
72 |
+
b->Args({256, 16, 1, 64});
|
73 |
+
b->Args({256, 64, 1, 64});
|
74 |
+
}
|
75 |
+
static void BenchmarkSamples4KernelSize(benchmark::internal::Benchmark* b)
|
76 |
+
{
|
77 |
+
b->ArgNames({"fft_size", "batch", "samples", "stride"});
|
78 |
+
b->Args({256, 1, 4, 16});
|
79 |
+
b->Args({256, 4, 4, 16});
|
80 |
+
b->Args({256, 16, 4, 16});
|
81 |
+
}
|
82 |
+
|
83 |
+
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
|
84 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x1, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1)
|
85 |
+
->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
|
86 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x2, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2)
|
87 |
+
->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
|
88 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x4, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4)
|
89 |
+
->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
|
90 |
+
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
|
91 |
+
|
92 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
93 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples1__neon, xnn_cs16_bfly4_samples1_ukernel__neon)
|
94 |
+
->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
|
95 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples4__neon, xnn_cs16_bfly4_samples4_ukernel__neon)
|
96 |
+
->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
|
97 |
+
BENCHMARK_CAPTURE(cs16_bfly4, neon_x1, xnn_cs16_bfly4_ukernel__neon_x1)
|
98 |
+
->Apply(BenchmarkKernelSize)->UseRealTime();
|
99 |
+
BENCHMARK_CAPTURE(cs16_bfly4, neon_x4, xnn_cs16_bfly4_ukernel__neon_x4)
|
100 |
+
->Apply(BenchmarkKernelSize)->UseRealTime();
|
101 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
102 |
+
|
103 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples1__scalar, xnn_cs16_bfly4_samples1_ukernel__scalar)
|
104 |
+
->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
|
105 |
+
BENCHMARK_CAPTURE(cs16_bfly4, samples4__scalar, xnn_cs16_bfly4_samples4_ukernel__scalar)
|
106 |
+
->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
|
107 |
+
BENCHMARK_CAPTURE(cs16_bfly4, scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)
|
108 |
+
->Apply(BenchmarkKernelSize)->UseRealTime();
|
109 |
+
BENCHMARK_CAPTURE(cs16_bfly4, scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)
|
110 |
+
->Apply(BenchmarkKernelSize)->UseRealTime();
|
111 |
+
BENCHMARK_CAPTURE(cs16_bfly4, scalar_x4, xnn_cs16_bfly4_ukernel__scalar_x4)
|
112 |
+
->Apply(BenchmarkKernelSize)->UseRealTime();
|
113 |
+
|
114 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
115 |
+
BENCHMARK_MAIN();
|
116 |
+
#endif
|
bench/cs16-fftr.cc
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <numeric>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include "bench/utils.h"
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
|
15 |
+
#include <xnnpack.h>
|
16 |
+
#include <xnnpack/aligned-allocator.h>
|
17 |
+
#include <xnnpack/common.h>
|
18 |
+
#include <xnnpack/fft.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
|
22 |
+
|
23 |
+
void cs16_fftr(
|
24 |
+
benchmark::State& state,
|
25 |
+
xnn_cs16_fftr_ukernel_fn fftr,
|
26 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
27 |
+
{
|
28 |
+
if ((isa_check != nullptr) && !isa_check(state)) {
|
29 |
+
return;
|
30 |
+
}
|
31 |
+
const size_t samples = state.range(0);
|
32 |
+
|
33 |
+
assert(samples % 2 == 0);
|
34 |
+
const size_t sample_size = samples * 2 + 2;
|
35 |
+
|
36 |
+
std::vector<int16_t, AlignedAllocator<int16_t, 64>> data(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
|
37 |
+
std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(samples);
|
38 |
+
|
39 |
+
std::iota(data.begin(), data.end(), 0);
|
40 |
+
std::iota(twiddle.begin(), twiddle.end(), 2);
|
41 |
+
|
42 |
+
for (auto _ : state) {
|
43 |
+
fftr(samples, data.data(), twiddle.data());
|
44 |
+
}
|
45 |
+
|
46 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
47 |
+
if (cpu_frequency != 0) {
|
48 |
+
state.counters["cpufreq"] = cpu_frequency;
|
49 |
+
}
|
50 |
+
}
|
51 |
+
|
52 |
+
static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
|
53 |
+
{
|
54 |
+
b->ArgNames({"samples"});
|
55 |
+
b->Args({256});
|
56 |
+
b->Args({1024});
|
57 |
+
}
|
58 |
+
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
|
59 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x1, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
|
60 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x4, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
|
61 |
+
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
|
62 |
+
|
63 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
64 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_neon_x4, xnn_cs16_fftr_ukernel__neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
|
65 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
66 |
+
|
67 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x1, xnn_cs16_fftr_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
|
68 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x2, xnn_cs16_fftr_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
|
69 |
+
BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x4, xnn_cs16_fftr_ukernel__scalar_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
|
70 |
+
|
71 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
72 |
+
BENCHMARK_MAIN();
|
73 |
+
#endif
|
bench/cs16-vsquareabs.cc
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <complex>
|
9 |
+
#include <functional>
|
10 |
+
#include <numeric>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include "bench/utils.h"
|
14 |
+
#include <benchmark/benchmark.h>
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vsquareabs.h>
|
22 |
+
|
23 |
+
|
24 |
+
void cs16_vsquareabs(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_cs16_vsquareabs_ukernel_fn vsquareabs,
|
27 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
28 |
+
{
|
29 |
+
if ((isa_check != nullptr) && !isa_check(state)) {
|
30 |
+
return;
|
31 |
+
}
|
32 |
+
const size_t num_elements = state.range(0);
|
33 |
+
|
34 |
+
std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(
|
35 |
+
num_elements * 2 + XNN_EXTRA_BYTES / sizeof(int16_t));
|
36 |
+
std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> output(num_elements);
|
37 |
+
std::iota(input.begin(), input.end(), 0);
|
38 |
+
std::iota(output.begin(), output.end(), 0);
|
39 |
+
|
40 |
+
for (auto _ : state) {
|
41 |
+
vsquareabs(num_elements * sizeof(int16_t) * 2, input.data(), output.data());
|
42 |
+
}
|
43 |
+
|
44 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
45 |
+
if (cpu_frequency != 0) {
|
46 |
+
state.counters["cpufreq"] = cpu_frequency;
|
47 |
+
}
|
48 |
+
|
49 |
+
const size_t elements_per_iteration = num_elements;
|
50 |
+
state.counters["elements"] =
|
51 |
+
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
|
52 |
+
|
53 |
+
const size_t bytes_per_iteration = num_elements * (sizeof(std::complex<int16_t>) + sizeof(uint32_t));
|
54 |
+
state.counters["bytes"] =
|
55 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
56 |
+
}
|
57 |
+
|
58 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
59 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x4,
|
60 |
+
xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4,
|
61 |
+
benchmark::utils::CheckNEON)
|
62 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
63 |
+
->UseRealTime();
|
64 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x8,
|
65 |
+
xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8,
|
66 |
+
benchmark::utils::CheckNEON)
|
67 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
68 |
+
->UseRealTime();
|
69 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x12,
|
70 |
+
xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12,
|
71 |
+
benchmark::utils::CheckNEON)
|
72 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
73 |
+
->UseRealTime();
|
74 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x16,
|
75 |
+
xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16,
|
76 |
+
benchmark::utils::CheckNEON)
|
77 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
78 |
+
->UseRealTime();
|
79 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
80 |
+
|
81 |
+
#if XNN_ARCH_HEXAGON
|
82 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x2,
|
83 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x2)
|
84 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
85 |
+
->UseRealTime();
|
86 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x4,
|
87 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x4)
|
88 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
89 |
+
->UseRealTime();
|
90 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x6,
|
91 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x6)
|
92 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
93 |
+
->UseRealTime();
|
94 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x8,
|
95 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x8)
|
96 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
97 |
+
->UseRealTime();
|
98 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x10,
|
99 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x10)
|
100 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
101 |
+
->UseRealTime();
|
102 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x12,
|
103 |
+
xnn_cs16_vsquareabs_ukernel__hexagon_x12)
|
104 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
105 |
+
->UseRealTime();
|
106 |
+
#endif // XNN_ARCH_HEXAGON
|
107 |
+
|
108 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x1,
|
109 |
+
xnn_cs16_vsquareabs_ukernel__scalar_x1)
|
110 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
111 |
+
->UseRealTime();
|
112 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x2,
|
113 |
+
xnn_cs16_vsquareabs_ukernel__scalar_x2)
|
114 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
115 |
+
->UseRealTime();
|
116 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x3,
|
117 |
+
xnn_cs16_vsquareabs_ukernel__scalar_x3)
|
118 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
119 |
+
->UseRealTime();
|
120 |
+
BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x4,
|
121 |
+
xnn_cs16_vsquareabs_ukernel__scalar_x4)
|
122 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
|
123 |
+
->UseRealTime();
|
124 |
+
|
125 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
126 |
+
BENCHMARK_MAIN();
|
127 |
+
#endif
|
bench/dconv.h
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#pragma once
|
10 |
+
|
11 |
+
#include <benchmark/benchmark.h>
|
12 |
+
|
13 |
+
|
14 |
+
#define BENCHMARK_DCONV(conv_fn) \
|
15 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1/v2")->Apply(MobileNetConvArguments)->UseRealTime(); \
|
16 |
+
BENCHMARK_CAPTURE(conv_fn, mobilenet_v3, "MobileNet v3")->Apply(MobileNetV3ConvArguments)->UseRealTime(); \
|
17 |
+
BENCHMARK_CAPTURE(conv_fn, shufflenet, "ShuffleNet v1/v2")->Apply(ShuffleNetConvArguments)->UseRealTime(); \
|
18 |
+
BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime();
|
19 |
+
|
20 |
+
|
21 |
+
// ShuffleNet v1/v2.
|
22 |
+
static void ShuffleNetConvArguments(benchmark::internal::Benchmark* b) {
|
23 |
+
b->ArgNames({"H", "W", "Cout"});
|
24 |
+
|
25 |
+
/********* Conv 1 ********/
|
26 |
+
/* H W GCout */
|
27 |
+
b->Args({224, 224, 24});
|
28 |
+
}
|
29 |
+
|
30 |
+
// MobileNet v1/v2.
|
31 |
+
static void MobileNetConvArguments(benchmark::internal::Benchmark* b) {
|
32 |
+
b->ArgNames({"H", "W", "Cout"});
|
33 |
+
|
34 |
+
/* H W GCout */
|
35 |
+
b->Args({224, 224, 32});
|
36 |
+
}
|
37 |
+
|
38 |
+
// MobileNet v3 Small/Large.
|
39 |
+
// MobileNet v3 Small/Large: shape of the initial-stage convolution.
// Argument order: input height, input width, output channels.
static void MobileNetV3ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "Cout"})
      // Initial stage: 224x224 input, 16 output channels.
      ->Args({224, 224, 16});
}
|
46 |
+
|
47 |
+
// SqueezeNet 1.1
|
48 |
+
// SqueezeNet 1.1: shape of the first (stem) convolution.
// Argument order: input height, input width, output channels.
//
// NOTE: the argument label was changed from "GCout" to "Cout" for consistency
// with the other argument generators in this header (ShuffleNet, MobileNet,
// MobileNet v3 all label the last argument "Cout").
static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "Cout"});

  /*********************** Conv 1 **********************/
  /*       H    W    Cout */
  b->Args({224, 224, 64});
}
|
bench/deconvolution.cc
ADDED
@@ -0,0 +1,575 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cfloat>
|
9 |
+
#include <cmath>
|
10 |
+
#include <functional>
|
11 |
+
#include <limits>
|
12 |
+
#include <memory>
|
13 |
+
#include <random>
|
14 |
+
#include <string>
|
15 |
+
#include <vector>
|
16 |
+
|
17 |
+
#include <xnnpack.h>
|
18 |
+
|
19 |
+
#include <benchmark/benchmark.h>
|
20 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
21 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
22 |
+
#include "tensorflow/lite/interpreter.h"
|
23 |
+
#include "tensorflow/lite/kernels/register.h"
|
24 |
+
#include "tensorflow/lite/model.h"
|
25 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
26 |
+
#include "tensorflow/lite/version.h"
|
27 |
+
#endif  // BENCHMARK_TENSORFLOW_LITE
|
28 |
+
#include "bench/utils.h"
|
29 |
+
|
30 |
+
// Benchmarks XNNPACK's QU8 (unsigned 8-bit quantized) NHWC 2D deconvolution
// (transposed convolution) operator.
//
// Benchmark arguments (state.range): N, input H, input W, kernel H, kernel W,
// padding H, padding W, A (output size adjustment), stride H, stride W,
// dilation, Cin, Cout. `net` names the source network (consumed by
// BENCHMARK_CAPTURE for labeling).
void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t adjustment = state.range(7);
  const size_t stride_height = state.range(8);
  const size_t stride_width = state.range(9);
  const size_t dilation = state.range(10);
  const size_t input_channels = state.range(11);
  const size_t output_channels = state.range(12);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
    std::ref(rng));

  // Deconvolution output size: stride * (in - 1) + adjustment + effective
  // kernel extent; std::max guards the unsigned subtraction of the padding.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
  const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;

  std::vector<uint8_t> input(batch_size * input_height * input_width * input_channels);
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> kernel(output_channels * kernel_height * kernel_width * input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
  std::vector<int32_t> bias(output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(i32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_channels;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate across enough operator/output buffer pairs that the working set
  // exceeds the last-level cache, so each iteration measures cold-ish data.
  // NOTE(review): sizeof(float) overestimates the QU8 working-set byte size;
  // that only makes the buffer count more conservative, so it is kept as-is.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (kernel.size() + bias.size() + output_elements));
  std::vector<uint8_t> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_create_deconvolution2d_nhwc_qu8(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      stride_height, stride_width,
      dilation, dilation,
      /*groups=*/1, input_channels, output_channels,
      /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
      127, 0.5f, 127, 0.5f,
      kernel.data(), bias.data(),
      127, 0.5f, 0, 255,
      0 /* flags */,
      nullptr, nullptr,
      &deconvolution_op);
    if (status != xnn_status_success) {
      // Fixed: messages previously said "QINT8" although this benchmarks the
      // unsigned (QU8) operator.
      state.SkipWithError("failed to create QU8 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_reshape_deconvolution2d_nhwc_qu8(
      deconvolution_operators[i],
      batch_size, input_height, input_width,
      0 /* height adjustment */, 0 /* width adjustment */,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      /*threadpool=*/nullptr);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to reshape QU8 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_setup_deconvolution2d_nhwc_qu8(
      deconvolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QU8 Deconvolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QU8 Deconvolution operator");
      return;
    }
  }

  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_delete_operator(deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QU8 Deconvolution operator");
      return;
    }
    deconvolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 ops (multiply + accumulate) per MAC; a deconvolution applies the full
  // kernel once per input pixel. Fixed: the count previously used
  // input_width * input_width, which is wrong for non-square inputs (e.g. the
  // 9x11 FCN shapes below).
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      input_channels * output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
159 |
+
|
160 |
+
// Benchmarks XNNPACK's FP32 NHWC 2D deconvolution (transposed convolution)
// operator.
//
// Benchmark arguments (state.range): N, input H, input W, kernel H, kernel W,
// padding H, padding W, A (output size adjustment), stride H, stride W,
// dilation, Cin, Cout. `net` names the source network (consumed by
// BENCHMARK_CAPTURE for labeling).
void xnnpack_deconvolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t adjustment = state.range(7);
  const size_t stride_height = state.range(8);
  const size_t stride_width = state.range(9);
  const size_t dilation = state.range(10);
  const size_t input_channels = state.range(11);
  const size_t output_channels = state.range(12);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  // Deconvolution output size: stride * (in - 1) + adjustment + effective
  // kernel extent; std::max guards the unsigned subtraction of the padding.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
  const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;

  // XNN_EXTRA_BYTES of slack lets vectorized kernels read past the end.
  std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
                           batch_size * input_height * input_width * input_channels);
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_channels;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Rotate across enough operator/output buffer pairs that the working set
  // exceeds the last-level cache, so each iteration measures cold-ish data.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (kernel.size() + bias.size() + output_elements));
  std::vector<float> output(output_elements * num_buffers);

  std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_create_deconvolution2d_nhwc_f32(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      stride_height, stride_width,
      dilation, dilation,
      /*groups=*/1, input_channels, output_channels,
      /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
      kernel.data(), bias.data(),
      -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
      0 /* flags */,
      nullptr,
      nullptr,
      &deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP32 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_reshape_deconvolution2d_nhwc_f32(
      deconvolution_operators[i],
      batch_size, input_height, input_width,
      0 /* height adjustment */, 0 /* width adjustment */,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      /*threadpool=*/nullptr);
    if (status != xnn_status_success) {
      // Fixed: this message previously said "QINT8" (copy-paste from the QU8
      // benchmark) although this is the FP32 path.
      state.SkipWithError("failed to reshape FP32 Deconvolution operator");
      return;
    }
  }

  for (size_t i = 0; i < deconvolution_operators.size(); i++) {
    status = xnn_setup_deconvolution2d_nhwc_f32(
      deconvolution_operators[i],
      input.data(), output.data() + i * output_elements);
    if (status != xnn_status_success) {
      // Fixed: this message previously said "QINT8" as well.
      state.SkipWithError("failed to setup FP32 Deconvolution operator");
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 Deconvolution operator");
      return;
    }
  }

  for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
    status = xnn_delete_operator(deconvolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP32 Deconvolution operator");
      return;
    }
    deconvolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + accumulate) per MAC; a deconvolution applies the full
  // kernel once per input pixel. Fixed: the count previously used
  // input_width * input_width, which is wrong for non-square inputs (e.g. the
  // 9x11 FCN shapes below).
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      input_channels * output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
287 |
+
|
288 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
289 |
+
// Benchmarks TensorFlow Lite's TRANSPOSE_CONV kernel on the same argument
// layout as the XNNPACK deconvolution benchmarks, for head-to-head comparison.
// Builds a one-operator FlatBuffer model in memory and runs it through the
// TFLite interpreter (single-threaded, without default delegates).
//
// Benchmark arguments (state.range): N, input H, input W, kernel H, kernel W,
// padding H, padding W, A (output size adjustment), stride H, stride W,
// dilation, Cin, Cout.
void tflite_deconvolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t adjustment = state.range(7);
  const size_t stride_height = state.range(8);
  const size_t stride_width = state.range(9);
  const size_t dilation = state.range(10);
  const size_t input_channels = state.range(11);
  const size_t output_channels = state.range(12);

  // TFLite's TRANSPOSE_CONV has no dilation parameter.
  if (dilation != 1) {
    state.SkipWithError("dilated deconvolution is not supported");
    return;
  }

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  // TFLite only exposes SAME/VALID padding; map the explicit padding onto one
  // of those modes or skip the shape.
  tflite::Padding tf_padding = tflite::Padding_VALID;
  if (padding_width == kernel_width - stride_width && padding_height == kernel_height - stride_height) {
    tf_padding = tflite::Padding_SAME;
  } else if (padding_width == 0 && padding_height == 0) {
    tf_padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  // Output size (dilation == 1, so the effective kernel is the kernel itself);
  // std::max guards the unsigned subtraction of the padding.
  const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + kernel_height, padding_height) - padding_height;
  const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + kernel_width, padding_width) - padding_width;

  std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_TRANSPOSE_CONV, 0);

  flatbuffers::Offset<tflite::TransposeConvOptions> transpose_conv_options = CreateTransposeConvOptions(
    builder,
    tf_padding,
    static_cast<int32_t>(stride_width), static_cast<int32_t>(stride_height));

  const std::array<int32_t, 4> input_shape{{
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(input_channels)
  }};
  const std::array<int32_t, 4> output_shape{{
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(output_channels)
  }};
  const std::array<int32_t, 4> filter_shape{{
    static_cast<int32_t>(output_channels),
    static_cast<int32_t>(kernel_height),
    static_cast<int32_t>(kernel_width),
    static_cast<int32_t>(input_channels)
  }};
  const std::array<int32_t, 1> output_shape_shape{{ 4 }};

  // Buffer 0 is the mandatory empty buffer; 1 holds the filter weights; 2
  // holds the output-shape tensor data.
  const std::array<flatbuffers::Offset<tflite::Buffer>, 3> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(kernel.data()),
      sizeof(float) * kernel.size())),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(output_shape.data()),
      sizeof(int32_t) * output_shape.size())),
  }};

  // Tensor 0: output-shape vector, 1: filter, 2: graph input, 3: graph output.
  const std::array<flatbuffers::Offset<tflite::Tensor>, 4> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape_shape.data(), output_shape_shape.size()),
                         tflite::TensorType_INT32,
                         2 /* buffer id */),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(filter_shape.data(), filter_shape.size()),
                         tflite::TensorType_FLOAT32,
                         1 /* buffer id */),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape.data(), input_shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape.data(), output_shape.size()),
                         tflite::TensorType_FLOAT32),
  }};

  // TRANSPOSE_CONV input order: output shape, weights, activation input.
  const std::array<int32_t, 3> op_inputs{{ 0, 1, 2 }};
  const std::array<int32_t, 1> op_outputs{{ 3 }};
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
    builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
    tflite::BuiltinOptions_TransposeConvOptions,
    transpose_conv_options.Union());

  const std::array<int32_t, 1> graph_inputs{{ 2 }};
  const std::array<int32_t, 1> graph_outputs{{ 3 }};
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1),
    builder.CreateString("TransposeConv subgraph"));

  const flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("TransposeConv model");

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the activation input (tensor 2) with random data.
  std::generate(
    interpreter->typed_tensor<float>(2),
    interpreter->typed_tensor<float>(2) + batch_size * input_channels * input_height * input_width,
    std::ref(f32rng));

  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::WipeCache();
    benchmark::utils::PrefetchToL1(
      interpreter->typed_tensor<float>(2),
      batch_size * input_channels * input_height * input_width * sizeof(float));
    state.ResumeTiming();

    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + accumulate) per MAC; the kernel is applied once per
  // input pixel. Fixed: the count previously used input_width * input_width,
  // which is wrong for non-square inputs (e.g. the 9x11 FCN shapes below).
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * input_height * input_width *
      input_channels * output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
468 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
469 |
+
|
470 |
+
// FCN-32 model (PASCAL VOC version).
|
471 |
+
// We assume CIF image (352x288) on model input / output.
|
472 |
+
// FCN-32 model (PASCAL VOC version), assuming a CIF (352x288) image on the
// model input/output: a single stride-32, 64x64 upsampling deconvolution over
// the 21 VOC classes.
static void FCN32(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"})
      ->Args({1, 9, 11, 64, 64, 0, 0, 0, 32, 32, 1, 21, 21});
}
|
478 |
+
|
479 |
+
// FCN-16 model (PASCAL VOC version).
|
480 |
+
// We assume CIF image (352x288) on model input / output.
|
481 |
+
// FCN-16 model (PASCAL VOC version), assuming a CIF (352x288) image on the
// model input/output: a stride-2 fuse step followed by a stride-16, 32x32
// upsampling deconvolution, both over the 21 VOC classes.
static void FCN16(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"})
      ->Args({1,  9, 11,  4,  4, 0, 0, 0,  2,  2, 1, 21, 21})
      ->Args({1, 18, 22, 32, 32, 0, 0, 0, 16, 16, 1, 21, 21});
}
|
488 |
+
|
489 |
+
// FCN-8 model (PASCAL VOC version).
|
490 |
+
// We assume CIF image (352x288) on model input / output.
|
491 |
+
// FCN-8 model (PASCAL VOC version), assuming a CIF (352x288) image on the
// model input/output: two stride-2 fuse steps followed by a stride-8, 16x16
// upsampling deconvolution, all over the 21 VOC classes.
static void FCN8(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"})
      ->Args({1,  9, 11,  4,  4, 0, 0, 0, 2, 2, 1, 21, 21})
      ->Args({1, 18, 22,  4,  4, 0, 0, 0, 2, 2, 1, 21, 21})
      ->Args({1, 36, 44, 16, 16, 0, 0, 0, 8, 8, 1, 21, 21});
}
|
499 |
+
|
500 |
+
// ENet deconvolution layers: the two upsampling bottlenecks (4.0 and 5.0) and
// the final full convolution.
static void ENet(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"})
      // Bottleneck 4.0
      ->Args({1,  64,  64, 3, 3, 2, 2, 1, 2, 2, 1, 32, 32})
      // Bottleneck 5.0
      ->Args({1, 128, 128, 3, 3, 2, 2, 1, 2, 2, 1, 16, 16})
      // Final full convolution
      ->Args({1, 256, 256, 2, 2, 0, 0, 0, 2, 2, 1, 16, 12});
}
|
513 |
+
|
514 |
+
// ESPNet deconvolution layers: three 2x2 stride-2 upsampling steps over 20
// channels at increasing resolutions.
static void ESPNet(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"})
      ->Args({1,  64, 128, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20})
      ->Args({1, 128, 256, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20})
      ->Args({1, 256, 512, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
}
|
522 |
+
|
523 |
+
// Register the XNNPACK FP32 deconvolution benchmark against each model's
// layer shapes.
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn32, "FCN-32")
  ->Apply(FCN32)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn16, "FCN-16")
  ->Apply(FCN16)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn8, "FCN-8")
  ->Apply(FCN8)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, enet, "ENet")
  ->Apply(ENet)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, espnet, "ESPNet")
  ->Apply(ESPNet)
  ->UseRealTime();

// Register the XNNPACK QU8 (quantized) deconvolution benchmark against the
// same shapes.
BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn32, "FCN-32")
  ->Apply(FCN32)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn16, "FCN-16")
  ->Apply(FCN16)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn8, "FCN-8")
  ->Apply(FCN8)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, enet, "ENet")
  ->Apply(ENet)
  ->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, espnet, "ESPNet")
  ->Apply(ESPNet)
  ->UseRealTime();

// TensorFlow Lite comparison benchmarks, only built when TFLite is available.
#ifdef BENCHMARK_TENSORFLOW_LITE
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn32, "FCN-32")
  ->Apply(FCN32)
  ->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn16, "FCN-16")
  ->Apply(FCN16)
  ->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn8, "FCN-8")
  ->Apply(FCN8)
  ->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, enet, "ENet")
  ->Apply(ENet)
  ->UseRealTime();
BENCHMARK_CAPTURE(tflite_deconvolution_f32, espnet, "ESPNet")
  ->Apply(ESPNet)
  ->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

// Emit main() unless this translation unit is linked into a combined
// benchmark binary that provides its own.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/dwconv.h
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#pragma once
|
10 |
+
|
11 |
+
#include <benchmark/benchmark.h>
|
12 |
+
|
13 |
+
|
14 |
+
// Registers one instance of the given depthwise-convolution benchmark
// function per supported network.  Each instance is labeled with the model
// name and pulls its layer shapes from the matching *DWConvArguments
// generator defined below in this header.  UseRealTime() makes the reported
// time wall-clock rather than per-thread CPU time.
#define BENCHMARK_DWCONV(dwconv_fn) \
  BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallDWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeDWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15DWConvArguments)->UseRealTime(); \
  BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20DWConvArguments)->UseRealTime();
|
28 |
+
|
29 |
+
|
30 |
+
// ShuffleNet v1 with 1 group.
|
31 |
+
// Depthwise-convolution shapes of ShuffleNet v1 with 1 group.
// Every depthwise layer in this model is 3x3 with total padding 2, stride 2,
// and no dilation; only the spatial size and channel count (G) change.
static void ShuffleNetV1G1DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, channels} — two configurations per stage (stages 2-4).
  static constexpr int64_t kSizeAndChannels[][2] = {
    {56,  36}, {28,  36},  // Stage 2
    {28,  72}, {14,  72},  // Stage 3
    {14, 144}, { 7, 144},  // Stage 4
  };
  for (const auto& sc : kSizeAndChannels) {
    /*        H      W     KH KW PH PW S  D   G  */
    b->Args({sc[0], sc[0], 3, 3, 2, 2, 2, 1, sc[1]});
  }
}
|
53 |
+
|
54 |
+
// ShuffleNet v1 with 2 groups.
|
55 |
+
// Depthwise-convolution shapes of ShuffleNet v1 with 2 groups.
// Every depthwise layer in this model is 3x3 with total padding 2, stride 2,
// and no dilation; only the spatial size and channel count (G) change.
static void ShuffleNetV1G2DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, channels} — two configurations per stage (stages 2-4).
  static constexpr int64_t kSizeAndChannels[][2] = {
    {56,  50}, {28,  50},  // Stage 2
    {28, 100}, {14, 100},  // Stage 3
    {14, 200}, { 7, 200},  // Stage 4
  };
  for (const auto& sc : kSizeAndChannels) {
    /*        H      W     KH KW PH PW S  D   G  */
    b->Args({sc[0], sc[0], 3, 3, 2, 2, 2, 1, sc[1]});
  }
}
|
77 |
+
|
78 |
+
// ShuffleNet v1 with 3 groups.
|
79 |
+
// Depthwise-convolution shapes of ShuffleNet v1 with 3 groups.
// Every depthwise layer in this model is 3x3 with total padding 2, stride 2,
// and no dilation; only the spatial size and channel count (G) change.
static void ShuffleNetV1G3DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, channels} — two configurations per stage (stages 2-4).
  static constexpr int64_t kSizeAndChannels[][2] = {
    {56,  60}, {28,  60},  // Stage 2
    {28, 120}, {14, 120},  // Stage 3
    {14, 240}, { 7, 240},  // Stage 4
  };
  for (const auto& sc : kSizeAndChannels) {
    /*        H      W     KH KW PH PW S  D   G  */
    b->Args({sc[0], sc[0], 3, 3, 2, 2, 2, 1, sc[1]});
  }
}
|
101 |
+
|
102 |
+
// ShuffleNet v1 with 4 groups.
|
103 |
+
// Depthwise-convolution shapes of ShuffleNet v1 with 4 groups.
// Every depthwise layer in this model is 3x3 with total padding 2, stride 2,
// and no dilation; only the spatial size and channel count (G) change.
static void ShuffleNetV1G4DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, channels} — two configurations per stage (stages 2-4).
  static constexpr int64_t kSizeAndChannels[][2] = {
    {56,  68}, {28,  68},  // Stage 2
    {28, 136}, {14, 136},  // Stage 3
    {14, 272}, { 7, 272},  // Stage 4
  };
  for (const auto& sc : kSizeAndChannels) {
    /*        H      W     KH KW PH PW S  D   G  */
    b->Args({sc[0], sc[0], 3, 3, 2, 2, 2, 1, sc[1]});
  }
}
|
125 |
+
|
126 |
+
// ShuffleNet v1 with 8 groups.
|
127 |
+
// Depthwise-convolution shapes of ShuffleNet v1 with 8 groups.
// Every depthwise layer in this model is 3x3 with total padding 2, stride 2,
// and no dilation; only the spatial size and channel count (G) change.
static void ShuffleNetV1G8DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, channels} — two configurations per stage (stages 2-4).
  static constexpr int64_t kSizeAndChannels[][2] = {
    {56,  96}, {28,  96},  // Stage 2
    {28, 192}, {14, 192},  // Stage 3
    {14, 384}, { 7, 384},  // Stage 4
  };
  for (const auto& sc : kSizeAndChannels) {
    /*        H      W     KH KW PH PW S  D   G  */
    b->Args({sc[0], sc[0], 3, 3, 2, 2, 2, 1, sc[1]});
  }
}
|
149 |
+
|
150 |
+
// ShuffleNet v2 (0.5X scale)
|
151 |
+
// Depthwise-convolution shapes of ShuffleNet v2 at 0.5X width.
// All depthwise layers are 3x3 with total padding 2 and no dilation; each
// stage contributes a stride-2 downsampling unit and a stride-1 unit.
static void ShuffleNetV2X05DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels} per unit.
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {56, 2, 24}, {28, 1, 24},  // Stage 2
    {28, 2, 48}, {14, 1, 48},  // Stage 3
    {14, 2, 96}, { 7, 1, 96},  // Stage 4
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
167 |
+
|
168 |
+
// ShuffleNet v2 (1.0X scale)
|
169 |
+
// Depthwise-convolution shapes of ShuffleNet v2 at 1.0X width.
// All depthwise layers are 3x3 with total padding 2 and no dilation.
static void ShuffleNetV2X10DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels} per unit.
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {56, 2,  24}, {56, 2,  58}, {28, 1,  58},  // Stage 2
    {28, 2, 116}, {14, 1, 116},                // Stage 3
    {14, 2, 232}, { 7, 1, 232},                // Stage 4
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
186 |
+
|
187 |
+
// ShuffleNet v2 (1.5X scale)
|
188 |
+
// Depthwise-convolution shapes of ShuffleNet v2 at 1.5X width.
// All depthwise layers are 3x3 with total padding 2 and no dilation.
static void ShuffleNetV2X15DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels} per unit.
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {56, 2,  24}, {56, 2,  88}, {28, 1,  88},  // Stage 2
    {28, 2, 176}, {14, 1, 176},                // Stage 3
    {14, 2, 352}, { 7, 1, 352},                // Stage 4
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
205 |
+
|
206 |
+
// ShuffleNet v2 (2.0X scale)
|
207 |
+
// Depthwise-convolution shapes of ShuffleNet v2 at 2.0X width.
// All depthwise layers are 3x3 with total padding 2 and no dilation.
static void ShuffleNetV2X20DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels} per unit.
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {56, 2,  24}, {56, 2, 122}, {28, 1, 122},  // Stage 2
    {28, 2, 244}, {14, 1, 244},                // Stage 3
    {14, 2, 488}, { 7, 1, 488},                // Stage 4
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
224 |
+
|
225 |
+
// Depthwise-convolution shapes of MobileNet v1.
// Every depthwise layer is 3x3 with total padding 2 and no dilation; the
// network alternates stride-1 layers with stride-2 downsampling layers while
// doubling the channel count.
static void MobileNetV1DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels} per depthwise layer.
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {112, 1,   32}, {112, 2,   64},
    { 56, 1,  128}, { 56, 2,  128},
    { 28, 1,  256}, { 28, 2,  256},
    { 14, 1,  512}, { 14, 2,  512},
    {  7, 1, 1024},
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
239 |
+
|
240 |
+
// Depthwise-convolution shapes of MobileNet v2.
// Every depthwise layer is 3x3 with total padding 2 and no dilation.  Each
// distinct configuration is benchmarked once even when the network repeats it
// across several bottleneck blocks.
static void MobileNetV2DWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, stride, channels}, in bottleneck order (blocks 1-6).
  static constexpr int64_t kSizeStrideChannels[][3] = {
    {112, 1,  32},                 // Bottleneck 1
    {112, 2,  96}, { 56, 1, 144},  // Bottleneck 2
    { 56, 2, 144}, { 28, 1, 192},  // Bottleneck 3
    { 28, 2, 192}, { 14, 1, 384},  // Bottleneck 4
    { 14, 1, 576},                 // Bottleneck 5
    { 14, 2, 576}, {  7, 1, 960},  // Bottleneck 6
  };
  for (const auto& ssc : kSizeStrideChannels) {
    /*        H       W      KH KW PH PW   S     D   G   */
    b->Args({ssc[0], ssc[0], 3, 3, 2, 2, ssc[1], 1, ssc[2]});
  }
}
|
281 |
+
|
282 |
+
// Depthwise-convolution shapes of MobileNet v3 Small.
// Kernels are 3x3 (total padding 2) or 5x5 (total padding 4), i.e. padding is
// always K - 1; dilation is 1 throughout.  Each distinct configuration is
// benchmarked once even when the network repeats it across bottlenecks.
static void MobileNetV3SmallDWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, kernel, stride, channels}, in bottleneck order.
  static constexpr int64_t kSizeKernelStrideChannels[][4] = {
    {112, 3, 2,  16},  // Bottleneck 1
    { 56, 3, 2,  72},  // Bottleneck 2
    { 28, 3, 1,  88},  // Bottleneck 3
    { 28, 5, 2,  96},  // Bottleneck 4
    { 14, 5, 1, 240},  // Bottlenecks 5-6
    { 14, 5, 1, 120},  // Bottleneck 7
    { 14, 5, 1, 144},  // Bottleneck 8
    { 14, 5, 2, 288},  // Bottleneck 9
    {  7, 5, 1, 576},  // Bottlenecks 10-11
  };
  for (const auto& sksc : kSizeKernelStrideChannels) {
    const int64_t size = sksc[0];
    const int64_t kernel = sksc[1];
    const int64_t padding = kernel - 1;
    /*       H     W     KH      KW       PH       PW       S     D     G   */
    b->Args({size, size, kernel, kernel, padding, padding, sksc[2], 1, sksc[3]});
  }
}
|
319 |
+
|
320 |
+
// Depthwise-convolution shapes of MobileNet v3 Large.
// Kernels are 3x3 (total padding 2) or 5x5 (total padding 4), i.e. padding is
// always K - 1; dilation is 1 throughout.  Each distinct configuration is
// benchmarked once even when the network repeats it across bottlenecks.
static void MobileNetV3LargeDWConvArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});

  // {input size, kernel, stride, channels}, in bottleneck order.
  static constexpr int64_t kSizeKernelStrideChannels[][4] = {
    {112, 3, 1,  16},  // Bottleneck 1
    {112, 3, 2,  64},  // Bottleneck 2
    { 56, 3, 1,  72},  // Bottleneck 3
    { 56, 5, 2,  72},  // Bottleneck 4
    { 28, 5, 1, 120},  // Bottlenecks 5-6
    { 28, 3, 2, 240},  // Bottleneck 7
    { 14, 3, 1, 200},  // Bottleneck 8
    { 14, 3, 1, 184},  // Bottlenecks 9-10
    { 14, 3, 1, 480},  // Bottleneck 11
    { 14, 3, 1, 672},  // Bottleneck 12
    { 14, 5, 2, 672},  // Bottleneck 13
    {  7, 5, 1, 960},  // Bottlenecks 14-15
  };
  for (const auto& sksc : kSizeKernelStrideChannels) {
    const int64_t size = sksc[0];
    const int64_t kernel = sksc[1];
    const int64_t padding = kernel - 1;
    /*       H     W     KH      KW       PH       PW       S     D     G   */
    b->Args({size, size, kernel, kernel, padding, padding, sksc[2], 1, sksc[3]});
  }
}
|
bench/elu.cc
ADDED
@@ -0,0 +1,460 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2020 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <array>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <limits>
|
11 |
+
#include <memory>
|
12 |
+
#include <random>
|
13 |
+
#include <vector>
|
14 |
+
|
15 |
+
#include <xnnpack.h>
|
16 |
+
|
17 |
+
#include <fp16/fp16.h>
|
18 |
+
#include "bench/utils.h"
|
19 |
+
#include <benchmark/benchmark.h>
|
20 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
21 |
+
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
|
22 |
+
#include "tensorflow/lite/interpreter.h"
|
23 |
+
#include "tensorflow/lite/kernels/register.h"
|
24 |
+
#include "tensorflow/lite/model.h"
|
25 |
+
#include "tensorflow/lite/schema/schema_generated.h"
|
26 |
+
#include "tensorflow/lite/version.h"
|
27 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
28 |
+
|
29 |
+
|
30 |
+
// Benchmarks the XNNPACK half-precision (f16) ELU operator on a 1-D batch
// whose size comes from the benchmark argument.  Reports elements/s and
// bytes/s (one read + one write per element) plus the current CPU frequency.
static void xnnpack_elu_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random fp16 inputs in [-20, 20], produced by converting random fp32
  // values; fp16 values are stored as raw uint16_t bit patterns.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // XNN_EXTRA_BYTES of slack lets vectorized kernels over-read the input.
  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<uint16_t> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Pre-fill the output with fp16 NaN so stale results are detectable.
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Create the operator once; only xnn_run_operator is timed below.
  xnn_operator_t elu_op = nullptr;
  status = xnn_create_elu_nc_f16(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f /* alpha */, 0 /* flags */, &elu_op);
  if (status != xnn_status_success || elu_op == nullptr) {
    state.SkipWithError("failed to create ELU operator");
    return;
  }

  status = xnn_reshape_elu_nc_f16(elu_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape ELU operator");
    return;
  }

  status = xnn_setup_elu_nc_f16(elu_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup ELU operator");
    return;
  }

  // Timed loop: run the pre-configured operator single-threaded.
  for (auto _ : state) {
    status = xnn_run_operator(elu_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run ELU operator");
      return;
    }
  }

  status = xnn_delete_operator(elu_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete ELU operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Factor 2: each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
96 |
+
|
97 |
+
// Benchmarks the XNNPACK single-precision (f32) ELU operator on a 1-D batch
// whose size comes from the benchmark argument.  Reports elements/s and
// bytes/s (one read + one write per element) plus the current CPU frequency.
static void xnnpack_elu_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random fp32 inputs in [-20, 20].
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));

  // XNN_EXTRA_BYTES of slack lets vectorized kernels over-read the input.
  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::vector<float> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  // Pre-fill the output with NaN so stale results are detectable.
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Create the operator once; only xnn_run_operator is timed below.
  xnn_operator_t elu_op = nullptr;
  status = xnn_create_elu_nc_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f /* alpha */, 0 /* flags */, &elu_op);
  if (status != xnn_status_success || elu_op == nullptr) {
    state.SkipWithError("failed to create ELU operator");
    return;
  }

  status = xnn_reshape_elu_nc_f32(elu_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape ELU operator");
    return;
  }

  status = xnn_setup_elu_nc_f32(elu_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup ELU operator");
    return;
  }

  // Timed loop: run the pre-configured operator single-threaded.
  for (auto _ : state) {
    status = xnn_run_operator(elu_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run ELU operator");
      return;
    }
  }

  status = xnn_delete_operator(elu_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete ELU operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Factor 2: each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
162 |
+
|
163 |
+
// Benchmarks the XNNPACK signed-8-bit-quantized (qs8) ELU operator on a 1-D
// batch whose size comes from the benchmark argument.  Uses identity
// quantization (zero point 0, scale 1) on both ends.  Reports elements/s and
// bytes/s (one read + one write per element) plus the current CPU frequency.
static void xnnpack_elu_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random inputs spanning the full int8 range.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
    std::ref(rng));

  // XNN_EXTRA_BYTES of slack lets vectorized kernels over-read the input.
  std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
  std::vector<int8_t> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  // Pre-fill the output with the 0xA5 sentinel byte so stale results are
  // detectable.  NOTE(review): 0xA5 exceeds int8_t's range, so this relies on
  // the usual implementation-defined wrap to -91 — confirm intended.
  std::fill(output.begin(), output.end(), INT8_C(0xA5));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // Create the operator once; only xnn_run_operator is timed below.
  xnn_operator_t elu_op = nullptr;
  status = xnn_create_elu_nc_qs8(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f /* alpha */,
    0 /* input zero point */, 1.0f /* input scale */,
    0 /* output zero point */, 1.0f /* output scale */,
    std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
    0 /* flags */, &elu_op);
  if (status != xnn_status_success || elu_op == nullptr) {
    state.SkipWithError("failed to create ELU operator");
    return;
  }

  status = xnn_reshape_elu_nc_qs8(elu_op, batch_size, /*threadpool=*/nullptr);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to reshape ELU operator");
    return;
  }

  status = xnn_setup_elu_nc_qs8(elu_op, input.data(), output.data());
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup ELU operator");
    return;
  }

  // Timed loop: run the pre-configured operator single-threaded.
  for (auto _ : state) {
    status = xnn_run_operator(elu_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run ELU operator");
      return;
    }
  }

  status = xnn_delete_operator(elu_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete ELU operator");
    return;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Factor 2: each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
234 |
+
|
235 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
236 |
+
// Benchmarks TensorFlow Lite's float32 ELU kernel for comparison against the
// XNNPACK operator above.  Builds a minimal single-op flatbuffer model
// (one input tensor -> ELU -> one output tensor) in memory, runs it through
// a single-threaded interpreter with the default delegates disabled, and
// reports the same elements/s and bytes/s counters.
static void tflite_elu_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  // Random fp32 inputs in [-20, 20] — same distribution as xnnpack_elu_f32.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));

  // --- Build the flatbuffer model ---
  flatbuffers::FlatBufferBuilder builder;
  const flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);

  // Single empty buffer: both tensors are dynamically allocated, not
  // backed by constant data.
  const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  // Both tensors are 1-D of length batch_size.
  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
  }};

  // The ELU op reads tensor 0 and writes tensor 1.
  const std::array<int32_t, 1> op_inputs{{ 0 }};
  const std::array<int32_t, 1> op_outputs{{ 1 }};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
    builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{ 0 }};
  const std::array<int32_t, 1> graph_outputs{{ 1 }};
  const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    builder.CreateString("ELU model"),
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // --- Instantiate a single-threaded interpreter on the built model ---
  // Default delegates are disabled so the stock TFLite kernel is measured
  // (not XNNPACK through the delegate).
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the input tensor (index 0) with random data.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  // Timed loop: one graph invocation per iteration.
  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Factor 2: each element is read once and written once.
  const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
|
331 |
+
|
332 |
+
static void tflite_elu_qs8(benchmark::State& state) {
|
333 |
+
const size_t batch_size = state.range(0);
|
334 |
+
|
335 |
+
std::random_device random_device;
|
336 |
+
auto rng = std::mt19937(random_device());
|
337 |
+
auto i8rng = std::bind(
|
338 |
+
std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
|
339 |
+
std::ref(rng));
|
340 |
+
|
341 |
+
flatbuffers::FlatBufferBuilder builder;
|
342 |
+
const flatbuffers::Offset<tflite::OperatorCode> operator_code =
|
343 |
+
CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);
|
344 |
+
|
345 |
+
const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
|
346 |
+
tflite::CreateBuffer(builder, builder.CreateVector({})),
|
347 |
+
}};
|
348 |
+
|
349 |
+
const std::array<int32_t, 1> shape{{
|
350 |
+
static_cast<int32_t>(batch_size)
|
351 |
+
}};
|
352 |
+
|
353 |
+
const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
|
354 |
+
tflite::CreateTensor(builder,
|
355 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
356 |
+
tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
|
357 |
+
tflite::CreateQuantizationParameters(builder,
|
358 |
+
0 /*min*/, 0 /*max*/,
|
359 |
+
builder.CreateVector<float>({1.0f /* scale */}),
|
360 |
+
builder.CreateVector<int64_t>({1 /* zero point */}))),
|
361 |
+
tflite::CreateTensor(builder,
|
362 |
+
builder.CreateVector<int32_t>(shape.data(), shape.size()),
|
363 |
+
tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
|
364 |
+
tflite::CreateQuantizationParameters(builder,
|
365 |
+
0 /*min*/, 0 /*max*/,
|
366 |
+
builder.CreateVector<float>({1.0f /* scale */}),
|
367 |
+
builder.CreateVector<int64_t>({1 /* zero point */}))),
|
368 |
+
}};
|
369 |
+
|
370 |
+
const std::array<int32_t, 1> op_inputs{{ 0 }};
|
371 |
+
const std::array<int32_t, 1> op_outputs{{ 1 }};
|
372 |
+
flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
|
373 |
+
builder,
|
374 |
+
0 /* opcode_index */,
|
375 |
+
builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
|
376 |
+
builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
|
377 |
+
|
378 |
+
const std::array<int32_t, 1> graph_inputs{{ 0 }};
|
379 |
+
const std::array<int32_t, 1> graph_outputs{{ 1 }};
|
380 |
+
const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
|
381 |
+
builder,
|
382 |
+
builder.CreateVector(tensors.data(), tensors.size()),
|
383 |
+
builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
|
384 |
+
builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
|
385 |
+
builder.CreateVector(&op, 1));
|
386 |
+
|
387 |
+
const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
|
388 |
+
TFLITE_SCHEMA_VERSION,
|
389 |
+
builder.CreateVector(&operator_code, 1),
|
390 |
+
builder.CreateVector(&subgraph, 1),
|
391 |
+
builder.CreateString("ELU model"),
|
392 |
+
builder.CreateVector(buffers.data(), buffers.size()));
|
393 |
+
|
394 |
+
builder.Finish(model_buffer);
|
395 |
+
|
396 |
+
const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
|
397 |
+
tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
|
398 |
+
tflite::InterpreterBuilder interpreterBuilder(model, resolver);
|
399 |
+
std::unique_ptr<tflite::Interpreter> interpreter;
|
400 |
+
if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
|
401 |
+
state.SkipWithError("failed to create TFLite interpreter");
|
402 |
+
return;
|
403 |
+
}
|
404 |
+
interpreter->SetNumThreads(1);
|
405 |
+
|
406 |
+
if (interpreter->AllocateTensors() != kTfLiteOk) {
|
407 |
+
state.SkipWithError("failed to allocate tensors");
|
408 |
+
return;
|
409 |
+
}
|
410 |
+
|
411 |
+
std::generate(
|
412 |
+
interpreter->typed_tensor<int8_t>(0),
|
413 |
+
interpreter->typed_tensor<int8_t>(0) + batch_size,
|
414 |
+
std::ref(i8rng));
|
415 |
+
|
416 |
+
for (auto _ : state) {
|
417 |
+
if (interpreter->Invoke() != kTfLiteOk) {
|
418 |
+
state.SkipWithError("failed to invoke TFLite interpreter");
|
419 |
+
return;
|
420 |
+
}
|
421 |
+
}
|
422 |
+
|
423 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
424 |
+
if (cpu_frequency != 0) {
|
425 |
+
state.counters["cpufreq"] = cpu_frequency;
|
426 |
+
}
|
427 |
+
|
428 |
+
state.counters["elements"] =
|
429 |
+
benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
|
430 |
+
|
431 |
+
const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
|
432 |
+
state.counters["bytes"] =
|
433 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
434 |
+
|
435 |
+
interpreter.reset();
|
436 |
+
}
|
437 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
438 |
+
|
439 |
+
BENCHMARK(xnnpack_elu_f16)
|
440 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
441 |
+
->UseRealTime();
|
442 |
+
BENCHMARK(xnnpack_elu_f32)
|
443 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
|
444 |
+
->UseRealTime();
|
445 |
+
BENCHMARK(xnnpack_elu_qs8)
|
446 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
|
447 |
+
->UseRealTime();
|
448 |
+
|
449 |
+
#ifdef BENCHMARK_TENSORFLOW_LITE
|
450 |
+
BENCHMARK(tflite_elu_f32)
|
451 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
|
452 |
+
->UseRealTime();
|
453 |
+
BENCHMARK(tflite_elu_qs8)
|
454 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
|
455 |
+
->UseRealTime();
|
456 |
+
#endif // BENCHMARK_TENSORFLOW_LITE
|
457 |
+
|
458 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
459 |
+
BENCHMARK_MAIN();
|
460 |
+
#endif
|
bench/end2end.cc
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <memory>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
|
15 |
+
#include "bench/utils.h"
|
16 |
+
|
17 |
+
#include <xnnpack.h>
|
18 |
+
#include <xnnpack/models.h>
|
19 |
+
|
20 |
+
|
21 |
+
static void End2EndBenchmark(
|
22 |
+
benchmark::State& state,
|
23 |
+
models::ExecutionPlanFactory model_factory)
|
24 |
+
{
|
25 |
+
if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
|
26 |
+
state.SkipWithError("failed to initialize XNNPACK");
|
27 |
+
return;
|
28 |
+
}
|
29 |
+
|
30 |
+
const size_t num_threads = state.range(0);
|
31 |
+
std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
|
32 |
+
pthreadpool_create(num_threads), pthreadpool_destroy);
|
33 |
+
|
34 |
+
auto execution_plan = model_factory(threadpool.get());
|
35 |
+
if (execution_plan.empty()) {
|
36 |
+
state.SkipWithError("failed to create a model");
|
37 |
+
return;
|
38 |
+
}
|
39 |
+
|
40 |
+
for (auto _ : state) {
|
41 |
+
for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
|
42 |
+
xnn_status status = xnn_run_operator(op.get(), threadpool.get());
|
43 |
+
if (status != xnn_status_success) {
|
44 |
+
state.SkipWithError("failed to run a model");
|
45 |
+
return;
|
46 |
+
}
|
47 |
+
}
|
48 |
+
}
|
49 |
+
|
50 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
51 |
+
if (cpu_frequency != 0) {
|
52 |
+
state.counters["cpufreq"] = cpu_frequency;
|
53 |
+
}
|
54 |
+
}
|
55 |
+
|
56 |
+
static void FP32MobileNetV1(benchmark::State& state) {
|
57 |
+
End2EndBenchmark(state, models::FP32MobileNetV1);
|
58 |
+
}
|
59 |
+
|
60 |
+
static void FP32MobileNetV2(benchmark::State& state) {
|
61 |
+
End2EndBenchmark(state, models::FP32MobileNetV2);
|
62 |
+
}
|
63 |
+
|
64 |
+
static void FP32MobileNetV3Large(benchmark::State& state) {
|
65 |
+
End2EndBenchmark(state, models::FP32MobileNetV3Large);
|
66 |
+
}
|
67 |
+
|
68 |
+
static void FP32MobileNetV3Small(benchmark::State& state) {
|
69 |
+
End2EndBenchmark(state, models::FP32MobileNetV3Small);
|
70 |
+
}
|
71 |
+
|
72 |
+
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
|
73 |
+
static void FP32MobileNetV3SmallFused(benchmark::State& state) {
|
74 |
+
End2EndBenchmark(state, models::FP32MobileNetV3SmallFused);
|
75 |
+
}
|
76 |
+
#endif // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
|
77 |
+
|
78 |
+
static void FP32Sparse80MobileNetV1(benchmark::State& state) {
|
79 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
80 |
+
return models::FP32SparseMobileNetV1(0.8f, threadpool);
|
81 |
+
});
|
82 |
+
}
|
83 |
+
|
84 |
+
static void FP32Sparse80MobileNetV2(benchmark::State& state) {
|
85 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
86 |
+
return models::FP32SparseMobileNetV2(0.8f, threadpool);
|
87 |
+
});
|
88 |
+
}
|
89 |
+
|
90 |
+
static void FP32Sparse80MobileNetV3Large(benchmark::State& state) {
|
91 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
92 |
+
return models::FP32SparseMobileNetV3Large(0.8f, threadpool);
|
93 |
+
});
|
94 |
+
}
|
95 |
+
|
96 |
+
static void FP32Sparse80MobileNetV3Small(benchmark::State& state) {
|
97 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
98 |
+
return models::FP32SparseMobileNetV3Small(0.8f, threadpool);
|
99 |
+
});
|
100 |
+
}
|
101 |
+
|
102 |
+
static void FP16MobileNetV1(benchmark::State& state) {
|
103 |
+
End2EndBenchmark(state, models::FP16MobileNetV1);
|
104 |
+
}
|
105 |
+
|
106 |
+
static void FP16MobileNetV2(benchmark::State& state) {
|
107 |
+
End2EndBenchmark(state, models::FP16MobileNetV2);
|
108 |
+
}
|
109 |
+
|
110 |
+
static void FP16MobileNetV3Large(benchmark::State& state) {
|
111 |
+
End2EndBenchmark(state, models::FP16MobileNetV3Large);
|
112 |
+
}
|
113 |
+
|
114 |
+
static void FP16MobileNetV3Small(benchmark::State& state) {
|
115 |
+
End2EndBenchmark(state, models::FP16MobileNetV3Small);
|
116 |
+
}
|
117 |
+
|
118 |
+
static void FP16Sparse80MobileNetV1(benchmark::State& state) {
|
119 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
120 |
+
return models::FP16SparseMobileNetV1(0.8f, threadpool);
|
121 |
+
});
|
122 |
+
}
|
123 |
+
|
124 |
+
static void FP16Sparse80MobileNetV2(benchmark::State& state) {
|
125 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
126 |
+
return models::FP16SparseMobileNetV2(0.8f, threadpool);
|
127 |
+
});
|
128 |
+
}
|
129 |
+
|
130 |
+
static void FP16Sparse80MobileNetV3Large(benchmark::State& state) {
|
131 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
132 |
+
return models::FP16SparseMobileNetV3Large(0.8f, threadpool);
|
133 |
+
});
|
134 |
+
}
|
135 |
+
|
136 |
+
static void FP16Sparse80MobileNetV3Small(benchmark::State& state) {
|
137 |
+
End2EndBenchmark(state, [](pthreadpool_t threadpool) {
|
138 |
+
return models::FP16SparseMobileNetV3Small(0.8f, threadpool);
|
139 |
+
});
|
140 |
+
}
|
141 |
+
|
142 |
+
static void QC8MobileNetV1(benchmark::State& state) {
|
143 |
+
End2EndBenchmark(state, models::QC8MobileNetV1);
|
144 |
+
}
|
145 |
+
|
146 |
+
static void QC8MobileNetV2(benchmark::State& state) {
|
147 |
+
End2EndBenchmark(state, models::QC8MobileNetV2);
|
148 |
+
}
|
149 |
+
|
150 |
+
static void QS8MobileNetV1(benchmark::State& state) {
|
151 |
+
End2EndBenchmark(state, models::QS8MobileNetV1);
|
152 |
+
}
|
153 |
+
|
154 |
+
static void QS8MobileNetV2(benchmark::State& state) {
|
155 |
+
End2EndBenchmark(state, models::QS8MobileNetV2);
|
156 |
+
}
|
157 |
+
|
158 |
+
static void QU8MobileNetV1(benchmark::State& state) {
|
159 |
+
End2EndBenchmark(state, models::QU8MobileNetV1);
|
160 |
+
}
|
161 |
+
|
162 |
+
static void QU8MobileNetV2(benchmark::State& state) {
|
163 |
+
End2EndBenchmark(state, models::QU8MobileNetV2);
|
164 |
+
}
|
165 |
+
|
166 |
+
BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
167 |
+
BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
168 |
+
BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
169 |
+
BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
170 |
+
|
171 |
+
BENCHMARK(FP32Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
172 |
+
BENCHMARK(FP32Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
173 |
+
BENCHMARK(FP32Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
174 |
+
BENCHMARK(FP32Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
175 |
+
|
176 |
+
BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
177 |
+
BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
178 |
+
BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
179 |
+
BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
180 |
+
|
181 |
+
BENCHMARK(FP16Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
182 |
+
BENCHMARK(FP16Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
183 |
+
BENCHMARK(FP16Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
184 |
+
BENCHMARK(FP16Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
185 |
+
|
186 |
+
BENCHMARK(QC8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
187 |
+
BENCHMARK(QC8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
188 |
+
|
189 |
+
BENCHMARK(QS8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
190 |
+
BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
191 |
+
|
192 |
+
BENCHMARK(QU8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
193 |
+
BENCHMARK(QU8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
194 |
+
|
195 |
+
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
|
196 |
+
BENCHMARK(FP32MobileNetV3SmallFused)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
197 |
+
#endif // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
|
198 |
+
|
199 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
200 |
+
BENCHMARK_MAIN();
|
201 |
+
#endif
|
bench/end2end.h
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#pragma once
|
7 |
+
|
8 |
+
#include <benchmark/benchmark.h>
|
9 |
+
|
10 |
+
#include <xnnpack/models.h>
|
11 |
+
|
12 |
+
|
13 |
+
#define BENCHMARK_FP16_END2END(benchmark_fn) \
|
14 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP16MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
15 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP16MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
16 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP16MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
17 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP16MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
18 |
+
|
19 |
+
#define BENCHMARK_FP32_END2END(benchmark_fn) \
|
20 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
21 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
22 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
23 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
24 |
+
|
25 |
+
#define BENCHMARK_FP32_END2END_JIT(benchmark_fn) \
|
26 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
27 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
28 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3LargeJit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
29 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3SmallJit)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
30 |
+
|
31 |
+
#define BENCHMARK_QS8_END2END(benchmark_fn) \
|
32 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QS8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
33 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
34 |
+
|
35 |
+
#define BENCHMARK_QU8_END2END(benchmark_fn) \
|
36 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QU8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
|
37 |
+
BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QU8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
|
bench/f16-conv-hwc2chw.cc
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/dconv.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/conv.h>
|
22 |
+
#include <xnnpack/microfnptr.h>
|
23 |
+
#include <xnnpack/microparams-init.h>
|
24 |
+
#include <xnnpack/pack.h>
|
25 |
+
|
26 |
+
|
27 |
+
static void f16_conv_hwc2chw(benchmark::State& state,
|
28 |
+
xnn_f16_conv_hwc2chw_ukernel_fn conv,
|
29 |
+
uint32_t output_channels_tile,
|
30 |
+
xnn_init_f16_minmax_params_fn init_params,
|
31 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
32 |
+
{
|
33 |
+
if ((isa_check != nullptr) && !isa_check(state)) {
|
34 |
+
return;
|
35 |
+
}
|
36 |
+
const size_t input_height = state.range(0);
|
37 |
+
const size_t input_width = state.range(1);
|
38 |
+
const size_t output_channels = state.range(2);
|
39 |
+
|
40 |
+
std::random_device random_device;
|
41 |
+
auto rng = std::mt19937(random_device());
|
42 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
|
43 |
+
auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
|
44 |
+
|
45 |
+
const size_t input_channels = 3;
|
46 |
+
const size_t kernel_size = 3;
|
47 |
+
const size_t padding = 1;
|
48 |
+
const size_t subsampling = 2;
|
49 |
+
|
50 |
+
const size_t output_height = (input_height + 2 * padding - kernel_size) / subsampling + 1;
|
51 |
+
const size_t output_width = (input_width + 2 * padding - kernel_size) / subsampling + 1;
|
52 |
+
|
53 |
+
std::vector<uint16_t> input(input_height * input_width * input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
|
54 |
+
std::generate(input.begin(), input.end(), std::ref(f16rng));
|
55 |
+
std::vector<uint16_t> kernel(output_channels * kernel_size * kernel_size * input_channels);
|
56 |
+
std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
|
57 |
+
std::vector<uint16_t> bias(output_channels);
|
58 |
+
std::generate(bias.begin(), bias.end(), std::ref(f16rng));
|
59 |
+
|
60 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> zero(input_channels * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
|
61 |
+
|
62 |
+
const size_t weights_elements = (kernel_size * kernel_size * input_channels + 1) *
|
63 |
+
benchmark::utils::RoundUp<size_t>(output_channels, output_channels_tile);
|
64 |
+
const size_t output_elements = output_height * output_width * output_channels;
|
65 |
+
const size_t num_buffers = 1 +
|
66 |
+
benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
|
67 |
+
sizeof(uint16_t) * (weights_elements + output_elements));
|
68 |
+
|
69 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(weights_elements * num_buffers);
|
70 |
+
std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
|
71 |
+
xnn_pack_f16_dconv_oki_w(
|
72 |
+
output_channels, input_channels, output_channels_tile,
|
73 |
+
kernel_size /* kernel height */, kernel_size /* kernel width */,
|
74 |
+
kernel.data(), bias.data(), packed_weights.data(), nullptr);
|
75 |
+
for (size_t n = 1; n < num_buffers; n++) {
|
76 |
+
std::copy(packed_weights.cbegin(),
|
77 |
+
packed_weights.cbegin() + weights_elements,
|
78 |
+
packed_weights.begin() + n * weights_elements);
|
79 |
+
}
|
80 |
+
|
81 |
+
std::vector<uint16_t> output(output_elements * num_buffers);
|
82 |
+
std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
|
83 |
+
|
84 |
+
xnn_f16_minmax_params params;
|
85 |
+
init_params(¶ms, 0x7C00 /* inf */, 0xFC00 /* -inf */);
|
86 |
+
|
87 |
+
size_t buffer_index = 0;
|
88 |
+
for (auto _ : state) {
|
89 |
+
state.PauseTiming();
|
90 |
+
benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
|
91 |
+
buffer_index = (buffer_index + 1) % num_buffers;
|
92 |
+
state.ResumeTiming();
|
93 |
+
|
94 |
+
conv(
|
95 |
+
input_height, input_width,
|
96 |
+
0 /* output_y_start */, output_height /* output_y_end */,
|
97 |
+
input.data(), zero.data(),
|
98 |
+
packed_weights.data() + buffer_index * weights_elements,
|
99 |
+
output.data() + buffer_index * output_elements,
|
100 |
+
padding, output_channels,
|
101 |
+
output_channels * output_width * sizeof(uint16_t),
|
102 |
+
output_channels * sizeof(uint16_t),
|
103 |
+
¶ms);
|
104 |
+
}
|
105 |
+
|
106 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
107 |
+
if (cpu_frequency != 0) {
|
108 |
+
state.counters["cpufreq"] = cpu_frequency;
|
109 |
+
}
|
110 |
+
|
111 |
+
state.counters["FLOPS"] = benchmark::Counter(
|
112 |
+
uint64_t(state.iterations()) * 2 *
|
113 |
+
output_height * output_width *
|
114 |
+
input_channels * output_channels *
|
115 |
+
kernel_size * kernel_size,
|
116 |
+
benchmark::Counter::kIsRate);
|
117 |
+
}
|
118 |
+
|
119 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
120 |
+
static void f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2(benchmark::State& state, const char* net) {
|
121 |
+
f16_conv_hwc2chw(state, xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2, 4,
|
122 |
+
xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
|
123 |
+
}
|
124 |
+
|
125 |
+
BENCHMARK_DCONV(f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2);
|
126 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
127 |
+
|
128 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
129 |
+
BENCHMARK_MAIN();
|
130 |
+
#endif
|
bench/f16-dwconv-e2e.cc
ADDED
@@ -0,0 +1,736 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <cstring>
|
9 |
+
#include <functional>
|
10 |
+
#include <memory>
|
11 |
+
#include <random>
|
12 |
+
#include <vector>
|
13 |
+
|
14 |
+
#include "bench/end2end.h"
|
15 |
+
#include "bench/utils.h"
|
16 |
+
#include <benchmark/benchmark.h>
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/config.h>
|
20 |
+
#include <xnnpack/dwconv.h>
|
21 |
+
#include <xnnpack/microfnptr.h>
|
22 |
+
#include <xnnpack/microparams-init.h>
|
23 |
+
#include <xnnpack/models.h>
|
24 |
+
|
25 |
+
|
26 |
+
static void DWConvEnd2EndBenchmark(
|
27 |
+
benchmark::State& state,
|
28 |
+
models::ExecutionPlanFactory model_factory,
|
29 |
+
xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv_minmax,
|
30 |
+
xnn_init_f16_minmax_params_fn init_params,
|
31 |
+
uint8_t channel_tile, uint8_t primary_tile,
|
32 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
33 |
+
{
|
34 |
+
if (isa_check != nullptr && !isa_check(state)) {
|
35 |
+
return;
|
36 |
+
}
|
37 |
+
if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
|
38 |
+
state.SkipWithError("failed to initialize XNNPACK");
|
39 |
+
return;
|
40 |
+
}
|
41 |
+
|
42 |
+
struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
|
43 |
+
if (dwconv_config == nullptr) {
|
44 |
+
state.SkipWithError("hardware does not support F16 DWCONV");
|
45 |
+
return;
|
46 |
+
}
|
47 |
+
|
48 |
+
// Save dwconv_config so that we can modify it for the benchmark and later restore it.
|
49 |
+
struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
|
50 |
+
memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
|
51 |
+
|
52 |
+
// Override microkernels chosen in xnn_initialize
|
53 |
+
for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
|
54 |
+
// Replace only the microkernel with the matching kernel size.
|
55 |
+
if (dwconv_config[i].primary_tile == primary_tile) {
|
56 |
+
std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
|
57 |
+
|
58 |
+
// Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
|
59 |
+
dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv_minmax);
|
60 |
+
dwconv_config[i].channel_tile = channel_tile;
|
61 |
+
dwconv_config[i].channel_subtile = channel_tile;
|
62 |
+
dwconv_config[i].channel_round = 1;
|
63 |
+
dwconv_config[i].primary_tile = primary_tile;
|
64 |
+
dwconv_config[i].init.f16 = init_params;
|
65 |
+
break;
|
66 |
+
}
|
67 |
+
}
|
68 |
+
|
69 |
+
auto execution_plan = model_factory(nullptr);
|
70 |
+
if (execution_plan.empty()) {
|
71 |
+
state.SkipWithError("failed to create a model");
|
72 |
+
return;
|
73 |
+
}
|
74 |
+
|
75 |
+
for (auto _ : state) {
|
76 |
+
for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
|
77 |
+
xnn_status status = xnn_run_operator(op.get(), nullptr);
|
78 |
+
if (status != xnn_status_success) {
|
79 |
+
state.SkipWithError("failed to run a model");
|
80 |
+
return;
|
81 |
+
}
|
82 |
+
}
|
83 |
+
}
|
84 |
+
|
85 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
86 |
+
if (cpu_frequency != 0) {
|
87 |
+
state.counters["cpufreq"] = cpu_frequency;
|
88 |
+
}
|
89 |
+
|
90 |
+
// Restore dwconv_config to original state as defined in init.c.
|
91 |
+
memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
|
92 |
+
}
|
93 |
+
|
94 |
+
static void DWConvEnd2EndBenchmark(
|
95 |
+
benchmark::State& state,
|
96 |
+
models::ExecutionPlanFactory model_factory,
|
97 |
+
xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv_minmax,
|
98 |
+
xnn_init_f16_minmax_params_fn init_params,
|
99 |
+
uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round,
|
100 |
+
uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile,
|
101 |
+
uint8_t primary_tile_to_replace,
|
102 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
103 |
+
{
|
104 |
+
if (isa_check != nullptr && !isa_check(state)) {
|
105 |
+
return;
|
106 |
+
}
|
107 |
+
|
108 |
+
struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
|
109 |
+
if (dwconv_config == nullptr) {
|
110 |
+
state.SkipWithError("failed to initialize f16 DWCONV config");
|
111 |
+
return;
|
112 |
+
}
|
113 |
+
|
114 |
+
// Save dwconv_convig so that we can modify it for the benchmark and later restore it.
|
115 |
+
struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
|
116 |
+
memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
|
117 |
+
|
118 |
+
bool found = false;
|
119 |
+
for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
|
120 |
+
if (dwconv_config[i].primary_tile == primary_tile_to_replace) {
|
121 |
+
found = true;
|
122 |
+
} else if (dwconv_config[i].last_tile != 0) {
|
123 |
+
// Found a multipass microkernel, replace it.
|
124 |
+
found = true;
|
125 |
+
}
|
126 |
+
}
|
127 |
+
|
128 |
+
if (!found) {
|
129 |
+
state.SkipWithError("can't replace with multipass");
|
130 |
+
return;
|
131 |
+
}
|
132 |
+
|
133 |
+
// Override microkernels chosen in xnn_initialize
|
134 |
+
for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
|
135 |
+
// Replace only the microkernel with the matching kernel size.
|
136 |
+
if (dwconv_config[i].primary_tile == primary_tile_to_replace ||
|
137 |
+
dwconv_config[i].last_tile != 0) {
|
138 |
+
// Replace either when the primary_tile_to_replace matches, or replace the
|
139 |
+
// first multipass dwconv microkernel we find.
|
140 |
+
// TODO(zhin): support specifying target multipass dwconv to replace.
|
141 |
+
std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
|
142 |
+
|
143 |
+
// Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
|
144 |
+
dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv_minmax);
|
145 |
+
dwconv_config[i].channel_tile = channel_tile;
|
146 |
+
dwconv_config[i].channel_subtile = channel_subtile;
|
147 |
+
dwconv_config[i].channel_round = channel_round;
|
148 |
+
dwconv_config[i].primary_tile = primary_tile;
|
149 |
+
dwconv_config[i].middle_tile = middle_tile;
|
150 |
+
dwconv_config[i].last_tile = last_tile;
|
151 |
+
dwconv_config[i].init.f16 = init_params;
|
152 |
+
break;
|
153 |
+
}
|
154 |
+
}
|
155 |
+
|
156 |
+
auto execution_plan = model_factory(nullptr);
|
157 |
+
if (execution_plan.empty()) {
|
158 |
+
state.SkipWithError("failed to create a model");
|
159 |
+
return;
|
160 |
+
}
|
161 |
+
|
162 |
+
for (auto _ : state) {
|
163 |
+
for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
|
164 |
+
xnn_status status = xnn_run_operator(op.get(), nullptr);
|
165 |
+
if (status != xnn_status_success) {
|
166 |
+
state.SkipWithError("failed to run a model");
|
167 |
+
return;
|
168 |
+
}
|
169 |
+
}
|
170 |
+
}
|
171 |
+
|
172 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
173 |
+
if (cpu_frequency != 0) {
|
174 |
+
state.counters["cpufreq"] = cpu_frequency;
|
175 |
+
}
|
176 |
+
|
177 |
+
memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
|
178 |
+
}
|
179 |
+
|
180 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
181 |
+
static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
182 |
+
DWConvEnd2EndBenchmark(
|
183 |
+
state, model,
|
184 |
+
xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
185 |
+
/*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
186 |
+
}
|
187 |
+
static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
188 |
+
DWConvEnd2EndBenchmark(
|
189 |
+
state, model,
|
190 |
+
xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
191 |
+
/*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
192 |
+
}
|
193 |
+
static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
194 |
+
DWConvEnd2EndBenchmark(
|
195 |
+
state, model,
|
196 |
+
xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
197 |
+
/*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
198 |
+
}
|
199 |
+
static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
200 |
+
DWConvEnd2EndBenchmark(
|
201 |
+
state, model,
|
202 |
+
xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
203 |
+
/*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
204 |
+
}
|
205 |
+
static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
206 |
+
DWConvEnd2EndBenchmark(
|
207 |
+
state, model,
|
208 |
+
xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
209 |
+
/*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
210 |
+
}
|
211 |
+
static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
212 |
+
DWConvEnd2EndBenchmark(
|
213 |
+
state, model,
|
214 |
+
xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
215 |
+
/*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
216 |
+
}
|
217 |
+
|
218 |
+
static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
219 |
+
DWConvEnd2EndBenchmark(
|
220 |
+
state, model,
|
221 |
+
xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
222 |
+
/*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
223 |
+
}
|
224 |
+
static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
225 |
+
DWConvEnd2EndBenchmark(
|
226 |
+
state, model,
|
227 |
+
xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
228 |
+
/*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
229 |
+
}
|
230 |
+
static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
231 |
+
DWConvEnd2EndBenchmark(
|
232 |
+
state, model,
|
233 |
+
xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
234 |
+
/*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
235 |
+
}
|
236 |
+
static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
237 |
+
DWConvEnd2EndBenchmark(
|
238 |
+
state, model,
|
239 |
+
xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
240 |
+
/*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
241 |
+
}
|
242 |
+
static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
243 |
+
DWConvEnd2EndBenchmark(
|
244 |
+
state, model,
|
245 |
+
xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
246 |
+
/*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
247 |
+
}
|
248 |
+
static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
249 |
+
DWConvEnd2EndBenchmark(
|
250 |
+
state, model,
|
251 |
+
xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
252 |
+
/*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
253 |
+
}
|
254 |
+
|
255 |
+
static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
256 |
+
DWConvEnd2EndBenchmark(
|
257 |
+
state, model,
|
258 |
+
xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
259 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
260 |
+
}
|
261 |
+
static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
262 |
+
DWConvEnd2EndBenchmark(
|
263 |
+
state, model,
|
264 |
+
xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
265 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
266 |
+
}
|
267 |
+
static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
268 |
+
DWConvEnd2EndBenchmark(
|
269 |
+
state, model,
|
270 |
+
xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
271 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
272 |
+
}
|
273 |
+
static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
274 |
+
DWConvEnd2EndBenchmark(
|
275 |
+
state, model,
|
276 |
+
xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
277 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
278 |
+
}
|
279 |
+
static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
280 |
+
DWConvEnd2EndBenchmark(
|
281 |
+
state, model,
|
282 |
+
xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
283 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
284 |
+
}
|
285 |
+
static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
286 |
+
DWConvEnd2EndBenchmark(
|
287 |
+
state, model,
|
288 |
+
xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
289 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
290 |
+
}
|
291 |
+
|
292 |
+
static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
293 |
+
DWConvEnd2EndBenchmark(
|
294 |
+
state, model,
|
295 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
296 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
297 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
298 |
+
/*primary_tile_to_replace=*/25,
|
299 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
300 |
+
}
|
301 |
+
static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
302 |
+
DWConvEnd2EndBenchmark(
|
303 |
+
state, model,
|
304 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
305 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
306 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
307 |
+
/*primary_tile_to_replace=*/25,
|
308 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
309 |
+
}
|
310 |
+
static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
311 |
+
DWConvEnd2EndBenchmark(
|
312 |
+
state, model,
|
313 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
314 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
315 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
316 |
+
/*primary_tile_to_replace=*/25,
|
317 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
318 |
+
}
|
319 |
+
static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
320 |
+
DWConvEnd2EndBenchmark(
|
321 |
+
state, model,
|
322 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
323 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
324 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
325 |
+
/*primary_tile_to_replace=*/25,
|
326 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
327 |
+
}
|
328 |
+
static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
329 |
+
DWConvEnd2EndBenchmark(
|
330 |
+
state, model,
|
331 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
332 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
333 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
334 |
+
/*primary_tile_to_replace=*/25,
|
335 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
336 |
+
}
|
337 |
+
static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
338 |
+
DWConvEnd2EndBenchmark(
|
339 |
+
state, model,
|
340 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
341 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
342 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
343 |
+
/*primary_tile_to_replace=*/25,
|
344 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
345 |
+
}
|
346 |
+
|
347 |
+
static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
348 |
+
DWConvEnd2EndBenchmark(
|
349 |
+
state, model,
|
350 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
351 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
352 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
353 |
+
/*primary_tile_to_replace=*/25,
|
354 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
355 |
+
}
|
356 |
+
static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
357 |
+
DWConvEnd2EndBenchmark(
|
358 |
+
state, model,
|
359 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
360 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
361 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
362 |
+
/*primary_tile_to_replace=*/25,
|
363 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
364 |
+
}
|
365 |
+
static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
366 |
+
DWConvEnd2EndBenchmark(
|
367 |
+
state, model,
|
368 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
369 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
370 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
371 |
+
/*primary_tile_to_replace=*/25,
|
372 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
373 |
+
}
|
374 |
+
static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
375 |
+
DWConvEnd2EndBenchmark(
|
376 |
+
state, model,
|
377 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
378 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
379 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
380 |
+
/*primary_tile_to_replace=*/25,
|
381 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
382 |
+
}
|
383 |
+
static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
384 |
+
DWConvEnd2EndBenchmark(
|
385 |
+
state, model,
|
386 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
387 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
388 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
389 |
+
/*primary_tile_to_replace=*/25,
|
390 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
391 |
+
}
|
392 |
+
static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
393 |
+
DWConvEnd2EndBenchmark(
|
394 |
+
state, model,
|
395 |
+
xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
396 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
397 |
+
/*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
|
398 |
+
/*primary_tile_to_replace=*/25,
|
399 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
400 |
+
}
|
401 |
+
|
402 |
+
static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
403 |
+
DWConvEnd2EndBenchmark(
|
404 |
+
state, model,
|
405 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
406 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
407 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
408 |
+
/*primary_tile_to_replace=*/25,
|
409 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
410 |
+
}
|
411 |
+
static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
412 |
+
DWConvEnd2EndBenchmark(
|
413 |
+
state, model,
|
414 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
415 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
416 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
417 |
+
/*primary_tile_to_replace=*/25,
|
418 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
419 |
+
}
|
420 |
+
static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
421 |
+
DWConvEnd2EndBenchmark(
|
422 |
+
state, model,
|
423 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
424 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
425 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
426 |
+
/*primary_tile_to_replace=*/25,
|
427 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
428 |
+
}
|
429 |
+
static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
430 |
+
DWConvEnd2EndBenchmark(
|
431 |
+
state, model,
|
432 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
433 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
434 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
435 |
+
/*primary_tile_to_replace=*/25,
|
436 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
437 |
+
}
|
438 |
+
static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
|
439 |
+
DWConvEnd2EndBenchmark(
|
440 |
+
state, model,
|
441 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
442 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
443 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
444 |
+
/*primary_tile_to_replace=*/25,
|
445 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
446 |
+
}
|
447 |
+
static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
448 |
+
DWConvEnd2EndBenchmark(
|
449 |
+
state, model,
|
450 |
+
xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
451 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
452 |
+
/*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
|
453 |
+
/*primary_tile_to_replace=*/25,
|
454 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
455 |
+
}
|
456 |
+
|
457 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith);
|
458 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith_acc2);
|
459 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith);
|
460 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith_acc2);
|
461 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith);
|
462 |
+
BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith_acc2);
|
463 |
+
|
464 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith);
|
465 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith_acc2);
|
466 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith);
|
467 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith_acc2);
|
468 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith);
|
469 |
+
BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith_acc2);
|
470 |
+
|
471 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith);
|
472 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith_acc2);
|
473 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith);
|
474 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith_acc2);
|
475 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith);
|
476 |
+
BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith_acc2);
|
477 |
+
|
478 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
|
479 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
|
480 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
|
481 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
|
482 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
|
483 |
+
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)
|
484 |
+
|
485 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
|
486 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
|
487 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
|
488 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
|
489 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
|
490 |
+
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)
|
491 |
+
|
492 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
|
493 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
|
494 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
|
495 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
|
496 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
|
497 |
+
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
|
498 |
+
|
499 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
500 |
+
|
501 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
502 |
+
static void f16_dwconv_25p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
503 |
+
DWConvEnd2EndBenchmark(
|
504 |
+
state, model,
|
505 |
+
xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
|
506 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
507 |
+
}
|
508 |
+
static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
509 |
+
DWConvEnd2EndBenchmark(
|
510 |
+
state, model,
|
511 |
+
xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
512 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
513 |
+
}
|
514 |
+
static void f16_dwconv_25p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
515 |
+
DWConvEnd2EndBenchmark(
|
516 |
+
state, model,
|
517 |
+
xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
|
518 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
519 |
+
}
|
520 |
+
static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
521 |
+
DWConvEnd2EndBenchmark(
|
522 |
+
state, model,
|
523 |
+
xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
524 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
525 |
+
}
|
526 |
+
static void f16_dwconv_25p32c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
527 |
+
DWConvEnd2EndBenchmark(
|
528 |
+
state, model,
|
529 |
+
xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
|
530 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
531 |
+
}
|
532 |
+
static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
533 |
+
DWConvEnd2EndBenchmark(
|
534 |
+
state, model,
|
535 |
+
xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
536 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
537 |
+
}
|
538 |
+
|
539 |
+
static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
540 |
+
DWConvEnd2EndBenchmark(
|
541 |
+
state, model,
|
542 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
543 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
544 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
545 |
+
/*primary_tile_to_replace=*/25,
|
546 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
547 |
+
}
|
548 |
+
static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
549 |
+
DWConvEnd2EndBenchmark(
|
550 |
+
state, model,
|
551 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
552 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
553 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
554 |
+
/*primary_tile_to_replace=*/25,
|
555 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
556 |
+
}
|
557 |
+
static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
558 |
+
DWConvEnd2EndBenchmark(
|
559 |
+
state, model,
|
560 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
561 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
562 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
563 |
+
/*primary_tile_to_replace=*/25,
|
564 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
565 |
+
}
|
566 |
+
static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
567 |
+
DWConvEnd2EndBenchmark(
|
568 |
+
state, model,
|
569 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
570 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
571 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
572 |
+
/*primary_tile_to_replace=*/25,
|
573 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
574 |
+
}
|
575 |
+
static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
|
576 |
+
DWConvEnd2EndBenchmark(
|
577 |
+
state, model,
|
578 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
579 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
580 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
581 |
+
/*primary_tile_to_replace=*/25,
|
582 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
583 |
+
}
|
584 |
+
static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
|
585 |
+
DWConvEnd2EndBenchmark(
|
586 |
+
state, model,
|
587 |
+
xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
588 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
589 |
+
/*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
|
590 |
+
/*primary_tile_to_replace=*/25,
|
591 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
592 |
+
}
|
593 |
+
|
594 |
+
// End-to-end benchmarks for f16 multipass DWCONV microkernels with a 6/6/7
// first/middle/last-pass tile split (FMA3), at channel tiles 8/16/32.
// All replace the 25-tap (5x5) unipass kernel in the execution plan.
static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
|
648 |
+
|
649 |
+
// End-to-end benchmarks for f16 multipass DWCONV microkernels with an 8/8/9
// first/middle/last-pass tile split (FMA3), at channel tiles 8/16/32.
// All replace the 25-tap (5x5) unipass kernel in the execution plan.
static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
  DWConvEnd2EndBenchmark(
    state, model,
    xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
    /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
    /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
    /*primary_tile_to_replace=*/25,
    /*isa_check=*/benchmark::utils::CheckFMA3);
}
|
703 |
+
|
704 |
+
// Register every FMA3 variant defined above with the per-model end-to-end
// benchmark harness (BENCHMARK_FP16_END2END is declared earlier in this file).
// Grouped by tiling scheme: 25-tap unipass, then the 5/5/5, 6/6/7 and 8/8/9
// multipass splits.
BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3_acc2)

BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)

BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)

BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3)
BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)

#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

// Provide a main() unless the embedding build links its own benchmark driver.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-dwconv.cc
ADDED
@@ -0,0 +1,795 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/dwconv.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/dwconv.h>
|
22 |
+
#include <xnnpack/indirection.h>
|
23 |
+
#include <xnnpack/microfnptr.h>
|
24 |
+
#include <xnnpack/microkernel-utils.h>
|
25 |
+
#include <xnnpack/microparams-init.h>
|
26 |
+
#include <xnnpack/operator.h>
|
27 |
+
#include <xnnpack/pack.h>
|
28 |
+
|
29 |
+
|
30 |
+
// Micro-benchmark driver for f16 unipass (single-pass) depthwise-convolution
// minmax microkernels.
//
// Benchmark arguments (state.range 0..8) describe the convolution geometry:
// input H/W, kernel H/W, padding H/W, subsampling (stride), dilation,
// channels. The microkernel is invoked once per output row.
//
// To defeat cache warming across iterations, num_buffers copies of the
// weights, indirection and output buffers are kept and rotated each
// iteration (sized so the rotation set exceeds the largest cache).
static void f16_dwconv(benchmark::State& state,
  xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv,
  xnn_init_f16_minmax_params_fn init_params,
  uint32_t channel_tile, uint32_t primary_tile,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip the benchmark when the host CPU lacks the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t channels = state.range(8);

  const size_t kernel_size = kernel_height * kernel_width;
  // A unipass kernel handles at most primary_tile taps in its single pass.
  if (kernel_size > primary_tile) {
    state.SkipWithError("kernel size mismatch");
    return;
  }

  // Random fp16 inputs/weights, generated by rounding uniform [0, 1) floats.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Output geometry for a dilated, strided, padded convolution.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;
  // Indirection-buffer step sizes (pointers per output pixel / per row).
  const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
  const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;

  // Packed weights are padded up to a whole number of channel tiles.
  const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);

  std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(channels * kernel_height * kernel_width);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(channels);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  // Zero page substituted for out-of-bounds (padding) input rows.
  std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));

  // kernel_size filter taps + 1 bias element per (padded) channel.
  const size_t w_elements = (kernel_size + 1) * c_stride;
  // Can read (primary_tile - kernel_size) elements after end of indirection buffer.
  const size_t i_elements = (primary_tile - kernel_size) + output_height * step_height;
  const size_t c_elements = output_size * channels;
  // Enough buffer copies that rotating through them busts the largest cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), UINT16_C(0));
  // Pack weights + bias into the unipass layout; middle/last tiles are 0.
  xnn_pack_f16_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels,
                            channel_tile, channel_tile, /*channel_round=*/1,
                            k.data(), b.data(), w.data(),
                            /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const uint16_t*> i(i_elements * num_buffers);
  // Minimal stack-allocated operator: only the fields consumed by
  // xnn_indirection_init_dwconv2d are populated.
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = channels;
  convolution_op.zero_buffer = z.data();
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;

  // Fill the first indirection buffer, then replicate it for rotation.
  xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, primary_tile, XNN_LOG2_SIZEOF_HALF);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  // Poison outputs with fp16 NaN so stale reads are detectable.
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Widest possible clamping range: [-inf, +inf] in fp16 bit patterns.
  xnn_f16_minmax_params params;
  init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Buffer rotation + input prefetch are excluded from the timed region.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // One microkernel call per output row.
    for (size_t y = 0; y < output_height; y++) {
      dwconv(channels, output_width,
        reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + y * output_width * channels,
        kernel_height * step_width * sizeof(void*), 0,
        0, z.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per tap per output element.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);

  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
    benchmark::Counter::kIsRate);
}
|
159 |
+
|
160 |
+
// Micro-benchmark driver for f16 multipass depthwise-convolution minmax
// microkernels (overload of the unipass driver above).
//
// A multipass kernel splits kernel_size taps into a first pass of
// first_pass_tile taps, zero or more middle passes of middle_pass_tile taps,
// and a last pass of last_pass_tile taps, accumulating partial sums in a
// scratch buffer between passes. Benchmark arguments (state.range 0..8) are
// the same convolution geometry as for the unipass driver.
static void f16_dwconv(benchmark::State& state,
  xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv,
  xnn_init_f16_minmax_params_fn init_params,
  uint32_t first_pass_tile,
  uint32_t middle_pass_tile,
  uint32_t last_pass_tile,
  uint32_t channel_tile,
  uint32_t channel_subtile,
  uint32_t channel_round,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip the benchmark when the host CPU lacks the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t channels = state.range(8);

  const size_t kernel_size = kernel_height * kernel_width;

  // A multipass kernel is only useful when the kernel does not already fit
  // in the first pass (otherwise a unipass kernel would be used).
  if (kernel_size <= first_pass_tile) {
    state.SkipWithError("kernel size mismatch");
    return;
  }

  // Random fp16 inputs/weights, generated by rounding uniform [0, 1) floats.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Output geometry for a dilated, strided, padded convolution.
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;
  // Indirection-buffer step sizes (pointers per output pixel / per row).
  const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
  const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;

  std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(channels * kernel_size);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(channels);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  // z: zero page for padding rows; buffer: inter-pass accumulator scratch.
  std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(channels + XNN_MULTIPASS_EXTRA_BYTES / sizeof(uint16_t));

  // Total taps covered by first + middle(s) + last pass (>= kernel_size).
  const size_t tile_size = xnn_dwconv_multipass_tile_size(
    kernel_size, first_pass_tile, middle_pass_tile, last_pass_tile);
  const size_t w_elements =
    xnn_dwconv_multipass_weights_size(
      tile_size, channels, channel_tile, channel_subtile, channel_round, /*bias_element_size=*/sizeof(uint16_t),
      /*log2_filter_element_size=*/1, /*extra_weights_byte=*/0) /
    sizeof(uint16_t);
  // Can read (primary_tile - kernel_size) elements after end of indirection buffer.
  const size_t i_elements = tile_size - kernel_size + output_height * step_height;
  const size_t c_elements = output_size * channels;
  // Enough buffer copies that rotating through them busts the largest cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), UINT16_C(0));
  // Pack weights + bias into the multipass tiled layout.
  xnn_pack_f16_dwconv_ghw_w(
    first_pass_tile, middle_pass_tile, last_pass_tile,
    kernel_height, kernel_width,
    channels, channel_tile, channel_subtile, channel_round,
    k.data(), b.data(), w.data(), /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const uint16_t*> i(i_elements * num_buffers);
  // Minimal stack-allocated operator: only the fields consumed by
  // xnn_indirection_init_dwconv2d are populated.
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = channels;
  convolution_op.zero_buffer = z.data();
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;

  // Fill the first indirection buffer, then replicate it for rotation.
  xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, tile_size, XNN_LOG2_SIZEOF_HALF);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  // Poison outputs with fp16 NaN so stale reads are detectable.
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Widest possible clamping range: [-inf, +inf] in fp16 bit patterns.
  xnn_f16_minmax_params params;
  init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  // The multipass kernel advances its indirection pointer internally by
  // (tile_size - last_pass_tile) entries; compensate in the row stride.
  const int input_advanced = tile_size - last_pass_tile;
  const int input_stride_elements = kernel_height * step_width - input_advanced;
  size_t buffer_index = 0;
  for (auto _ : state) {
    // Buffer rotation + input prefetch are excluded from the timed region.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // One microkernel call per output row.
    for (size_t y = 0; y < output_height; y++) {
      dwconv(channels, output_width,
        reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + y * output_width * channels,
        input_stride_elements * sizeof(void*), 0,
        0, z.data(), kernel_size, buffer.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per tap per output element.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);

  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
    benchmark::Counter::kIsRate);
}
|
303 |
+
|
304 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // Unipass NEON-FP16 variants. The driver signature is
  // f16_dwconv(state, ukernel, init_params, channel_tile, primary_tile,
  // isa_check); naming <P>p<C>c = P-tap primary tile, C-channel tile,
  // _acc2 = two accumulators.
  static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/8, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/16, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/4, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/9, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, const char* net) {
    f16_dwconv(state,
      xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith,
      xnn_init_f16_minmax_fp16arith_params,
      /*channel_tile=*/32, /*primary_tile=*/25, benchmark::utils::CheckNEONFP16ARITH);
  }
|
430 |
+
|
431 |
+
static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
432 |
+
f16_dwconv(
|
433 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
434 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
435 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
436 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
437 |
+
}
|
438 |
+
static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
439 |
+
f16_dwconv(
|
440 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
441 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
442 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
443 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
444 |
+
}
|
445 |
+
static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
446 |
+
f16_dwconv(
|
447 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
448 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
449 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
450 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
451 |
+
}
|
452 |
+
static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
453 |
+
f16_dwconv(
|
454 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
455 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
456 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
457 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
458 |
+
}
|
459 |
+
static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
460 |
+
f16_dwconv(
|
461 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
462 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
463 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
464 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
465 |
+
}
|
466 |
+
static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
467 |
+
f16_dwconv(
|
468 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
469 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
470 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
471 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
472 |
+
}
|
473 |
+
|
474 |
+
static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
475 |
+
f16_dwconv(
|
476 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
477 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
478 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
479 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
480 |
+
}
|
481 |
+
static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
482 |
+
f16_dwconv(
|
483 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
484 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
485 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
486 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
487 |
+
}
|
488 |
+
static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
489 |
+
f16_dwconv(
|
490 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
491 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
492 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
493 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
494 |
+
}
|
495 |
+
static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
496 |
+
f16_dwconv(
|
497 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
498 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
499 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
500 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
501 |
+
}
|
502 |
+
static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
503 |
+
f16_dwconv(
|
504 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
505 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
506 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
507 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
508 |
+
}
|
509 |
+
static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
510 |
+
f16_dwconv(
|
511 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
512 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
513 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
514 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
515 |
+
}
|
516 |
+
|
517 |
+
static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
518 |
+
f16_dwconv(
|
519 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
520 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
521 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
522 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
523 |
+
}
|
524 |
+
static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
525 |
+
f16_dwconv(
|
526 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
527 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
528 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
529 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
530 |
+
}
|
531 |
+
static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
532 |
+
f16_dwconv(
|
533 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
534 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
535 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
536 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
537 |
+
}
|
538 |
+
static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
539 |
+
f16_dwconv(
|
540 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
541 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
542 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
543 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
544 |
+
}
|
545 |
+
static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
|
546 |
+
f16_dwconv(
|
547 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
|
548 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
549 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
550 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
551 |
+
}
|
552 |
+
static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
|
553 |
+
f16_dwconv(
|
554 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
|
555 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
556 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
557 |
+
/*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
|
558 |
+
}
|
559 |
+
|
560 |
+
// Register the single-pass NEON FP16 benchmarks (channel tiles 8/16/32;
// primary tiles 4, 9, and 25 for 2x2, 3x3, and 5x5 kernels respectively).
BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith)

// Register the multipass NEON FP16 benchmarks, grouped by pass-tile shape.
BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)

BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)

BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
600 |
+
|
601 |
+
|
602 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
603 |
+
static void f16_dwconv_25p8c__fma3(benchmark::State& state, const char* net) {
|
604 |
+
f16_dwconv(
|
605 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
|
606 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
607 |
+
}
|
608 |
+
static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, const char* net) {
|
609 |
+
f16_dwconv(
|
610 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
611 |
+
/*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
612 |
+
}
|
613 |
+
static void f16_dwconv_25p16c__fma3(benchmark::State& state, const char* net) {
|
614 |
+
f16_dwconv(
|
615 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
|
616 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
617 |
+
}
|
618 |
+
static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, const char* net) {
|
619 |
+
f16_dwconv(
|
620 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
621 |
+
/*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
622 |
+
}
|
623 |
+
static void f16_dwconv_25p32c__fma3(benchmark::State& state, const char* net) {
|
624 |
+
f16_dwconv(
|
625 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
|
626 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
627 |
+
}
|
628 |
+
static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, const char* net) {
|
629 |
+
f16_dwconv(
|
630 |
+
state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
631 |
+
/*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
|
632 |
+
}
|
633 |
+
|
634 |
+
static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, const char* net) {
|
635 |
+
f16_dwconv(
|
636 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
637 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
638 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
639 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
640 |
+
}
|
641 |
+
static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
642 |
+
f16_dwconv(
|
643 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
644 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
645 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
646 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
647 |
+
}
|
648 |
+
static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, const char* net) {
|
649 |
+
f16_dwconv(
|
650 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
651 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
652 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
653 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
654 |
+
}
|
655 |
+
static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
656 |
+
f16_dwconv(
|
657 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
658 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
659 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
660 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
661 |
+
}
|
662 |
+
static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, const char* net) {
|
663 |
+
f16_dwconv(
|
664 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
665 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
666 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
667 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
668 |
+
}
|
669 |
+
static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
670 |
+
f16_dwconv(
|
671 |
+
state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
672 |
+
/*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
|
673 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
674 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
675 |
+
}
|
676 |
+
|
677 |
+
static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, const char* net) {
|
678 |
+
f16_dwconv(
|
679 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
680 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
681 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
682 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
683 |
+
}
|
684 |
+
static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
685 |
+
f16_dwconv(
|
686 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
687 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
688 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
689 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
690 |
+
}
|
691 |
+
static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, const char* net) {
|
692 |
+
f16_dwconv(
|
693 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
694 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
695 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
696 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
697 |
+
}
|
698 |
+
static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
699 |
+
f16_dwconv(
|
700 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
701 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
702 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
703 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
704 |
+
}
|
705 |
+
static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, const char* net) {
|
706 |
+
f16_dwconv(
|
707 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
708 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
709 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
710 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
711 |
+
}
|
712 |
+
static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
713 |
+
f16_dwconv(
|
714 |
+
state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
715 |
+
/*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
|
716 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
717 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
718 |
+
}
|
719 |
+
|
720 |
+
static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, const char* net) {
|
721 |
+
f16_dwconv(
|
722 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
723 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
724 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
725 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
726 |
+
}
|
727 |
+
static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
728 |
+
f16_dwconv(
|
729 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
730 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
731 |
+
/*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
|
732 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
733 |
+
}
|
734 |
+
static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, const char* net) {
|
735 |
+
f16_dwconv(
|
736 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
737 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
738 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
739 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
740 |
+
}
|
741 |
+
static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
742 |
+
f16_dwconv(
|
743 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
744 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
745 |
+
/*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
|
746 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
747 |
+
}
|
748 |
+
static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, const char* net) {
|
749 |
+
f16_dwconv(
|
750 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
|
751 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
752 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
753 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
754 |
+
}
|
755 |
+
static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
|
756 |
+
f16_dwconv(
|
757 |
+
state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
|
758 |
+
/*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
|
759 |
+
/*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
|
760 |
+
/*isa_check=*/benchmark::utils::CheckFMA3);
|
761 |
+
}
|
762 |
+
|
763 |
+
// Register the single-pass FMA3 benchmarks (25-element primary tile).
BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3)
BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3)
BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3)
BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3_acc2)

// Register the multipass FMA3 benchmarks, grouped by pass-tile shape.
BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)

BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)

BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3)
BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)

#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

// Benchmark executables link their own main unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN (e.g. when aggregating several benchmark TUs).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-dwconv2d-chw.cc
ADDED
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/dwconv.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/dwconv.h>
|
22 |
+
#include <xnnpack/indirection.h>
|
23 |
+
#include <xnnpack/microfnptr.h>
|
24 |
+
#include <xnnpack/microparams-init.h>
|
25 |
+
#include <xnnpack/operator.h>
|
26 |
+
#include <xnnpack/pack.h>
|
27 |
+
|
28 |
+
|
29 |
+
// Benchmark harness for an f16 CHW-layout depthwise-convolution microkernel.
//
// Args:
//   state       - google-benchmark state; ranges 0..8 supply input height/width,
//                 kernel height/width, padding height/width, subsampling
//                 (stride), dilation, and channel count.
//   dwconv      - the CHW depthwise microkernel under test.
//   init_params - initializer for the kernel's f16 CHW params struct.
//   kh, kw      - kernel height/width the microkernel is specialized for.
//   pw          - one-sided width padding the microkernel is specialized for.
//   s           - subsampling (stride) the microkernel is specialized for.
//   isa_check   - optional ISA predicate; the benchmark is skipped when it fails.
//
// Benchmarks whose state ranges do not match the microkernel's specialization
// (kernel shape, stride, padding, dilation) are skipped with an error.
static void f16_dwconv2d_chw(benchmark::State& state,
  xnn_f16_dwconv2d_chw_ukernel_fn dwconv,
  xnn_init_f16_chw_params_fn init_params,
  uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if ((isa_check != nullptr) && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t channels = state.range(8);

  if (kernel_height != kh) {
    state.SkipWithError("kernel height mismatch");
    return;
  }

  if (kernel_width != kw) {
    state.SkipWithError("kernel width mismatch");
    return;
  }

  if (subsampling != s) {
    state.SkipWithError("subsampling mismatch");
    return;
  }

  // CHW kernels assume symmetric width padding of exactly `pw` on each side.
  if (padding_width % 2 != 0 || padding_width / 2 != pw) {
    state.SkipWithError("padding width mismatch");
    return;
  }

  if (dilation != 1) {
    state.SkipWithError("unsupported dilation");
    return;
  }

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // Renamed from inputSize for consistency with the snake_case locals here.
  const size_t input_size = (input_height + padding_height) * input_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t output_size = output_height * output_width;

  std::vector<uint16_t> input(input_size * channels + 2 * XNN_EXTRA_BYTES);
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  std::vector<uint16_t> bias(channels);
  std::generate(bias.begin(), bias.end(), std::ref(f16rng));
  std::vector<uint16_t> kernel(channels * kernel_size);
  std::generate(kernel.begin(), kernel.end(), std::ref(f16rng))
;
  std::vector<uint16_t> zero(input_width + padding_width);

  // Per-channel packed layout: bias followed by the kernel taps.
  const size_t w_elements = (kernel_size + 1) * channels;
  const size_t o_elements = output_size * channels;
  // Rotate through enough weight/output buffers to defeat the cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + o_elements));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(w_elements * num_buffers);
  std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
  for (size_t c = 0; c < channels; c++) {
    packed_weights[c * kernel_size + c] = bias[c];
    for (size_t i = 0; i < kernel_size; i++) {
      packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
    }
  }
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
  }

  std::vector<uint16_t> output(o_elements * num_buffers);
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  xnn_f16_chw_params chw_params;
  init_params(&chw_params,
    input_width, 0xFC00 /* -inf */, 0x7C00 /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t channel = 0; channel < channels; channel++) {
      dwconv(
        input_height, input_width * sizeof(uint16_t),
        input.data() + channel * input_size,
        packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
        zero.data(),
        output.data() + channel * output_size + buffer_index * o_elements,
        padding_height / 2,  // padding_top
        &chw_params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // One multiply + one add per tap per output element.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
    benchmark::Counter::kIsRate);

  state.counters["bytes"] = benchmark::Counter(
    uint64_t(state.iterations()) * (output_size + input_size + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
    benchmark::Counter::kIsRate);
}
|
153 |
+
|
154 |
+
|
155 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
// Declares one benchmark entry point per 3x3, padding-1, stride-1 CHW
// depthwise-convolution microkernel variant. Each generated function simply
// forwards to the f16_dwconv2d_chw harness with the matching ukernel and the
// stride-1 parameter initializer.
#define BENCH_F16_DWCONV2D_CHW_3X3P1(suffix)                                  \
  static void dwconv2d_chw_3x3p1__##suffix(benchmark::State& state,           \
                                           const char* net) {                 \
    f16_dwconv2d_chw(state,                                                   \
                     xnn_f16_dwconv2d_chw_ukernel_3x3p1__##suffix,            \
                     xnn_init_f16_chw_neonfp16arith_stride1_params,           \
                     3 /* kernel height */, 3 /* kernel width */,             \
                     1 /* width padding */, 1 /* stride */,                   \
                     benchmark::utils::CheckNEONFP16ARITH);                   \
  }

BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_1x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_2x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_3x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_4x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_5x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_6x8)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_1x8_acc2)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_1x8_acc3)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_1x8_acc4)
BENCH_F16_DWCONV2D_CHW_3X3P1(neonfp16arith_2x8_acc2)

#undef BENCH_F16_DWCONV2D_CHW_3X3P1
227 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8(benchmark::State& state, const char* net) {
|
228 |
+
f16_dwconv2d_chw(state,
|
229 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8,
|
230 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
231 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
232 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
233 |
+
}
|
234 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8(benchmark::State& state, const char* net) {
|
235 |
+
f16_dwconv2d_chw(state,
|
236 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8,
|
237 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
238 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
239 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
240 |
+
}
|
241 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_3x8(benchmark::State& state, const char* net) {
|
242 |
+
f16_dwconv2d_chw(state,
|
243 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x8,
|
244 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
245 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
246 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
247 |
+
}
|
248 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_4x8(benchmark::State& state, const char* net) {
|
249 |
+
f16_dwconv2d_chw(state,
|
250 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_4x8,
|
251 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
252 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
253 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
254 |
+
}
|
255 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
|
256 |
+
f16_dwconv2d_chw(state,
|
257 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc2,
|
258 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
259 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
260 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
261 |
+
}
|
262 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
|
263 |
+
f16_dwconv2d_chw(state,
|
264 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc3,
|
265 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
266 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
267 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
268 |
+
}
|
269 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
|
270 |
+
f16_dwconv2d_chw(state,
|
271 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc4,
|
272 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
273 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
274 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
275 |
+
}
|
276 |
+
static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
|
277 |
+
f16_dwconv2d_chw(state,
|
278 |
+
xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8_acc2,
|
279 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
280 |
+
3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
|
281 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
282 |
+
}
|
283 |
+
|
284 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
|
285 |
+
f16_dwconv2d_chw(state,
|
286 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8,
|
287 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
288 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
289 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
290 |
+
}
|
291 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
|
292 |
+
f16_dwconv2d_chw(state,
|
293 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8,
|
294 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
295 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
296 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
297 |
+
}
|
298 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
|
299 |
+
f16_dwconv2d_chw(state,
|
300 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8,
|
301 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
302 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
303 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
304 |
+
}
|
305 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_4x8(benchmark::State& state, const char* net) {
|
306 |
+
f16_dwconv2d_chw(state,
|
307 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8,
|
308 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
309 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
310 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
311 |
+
}
|
312 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_5x8(benchmark::State& state, const char* net) {
|
313 |
+
f16_dwconv2d_chw(state,
|
314 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_5x8,
|
315 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
316 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
317 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
318 |
+
}
|
319 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
|
320 |
+
f16_dwconv2d_chw(state,
|
321 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc2,
|
322 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
323 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
324 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
325 |
+
}
|
326 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
|
327 |
+
f16_dwconv2d_chw(state,
|
328 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc3,
|
329 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
330 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
331 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
332 |
+
}
|
333 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
|
334 |
+
f16_dwconv2d_chw(state,
|
335 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc4,
|
336 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
337 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
338 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
339 |
+
}
|
340 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
|
341 |
+
f16_dwconv2d_chw(state,
|
342 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc5,
|
343 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
344 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
345 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
346 |
+
}
|
347 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
|
348 |
+
f16_dwconv2d_chw(state,
|
349 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc2,
|
350 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
351 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
352 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
353 |
+
}
|
354 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
|
355 |
+
f16_dwconv2d_chw(state,
|
356 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc3,
|
357 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
358 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
359 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
360 |
+
}
|
361 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
|
362 |
+
f16_dwconv2d_chw(state,
|
363 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8_acc2,
|
364 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
365 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
366 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
367 |
+
}
|
368 |
+
static void dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2(benchmark::State& state, const char* net) {
|
369 |
+
f16_dwconv2d_chw(state,
|
370 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8_acc2,
|
371 |
+
xnn_init_f16_chw_neonfp16arith_stride1_params,
|
372 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
|
373 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
374 |
+
}
|
375 |
+
|
376 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
|
377 |
+
f16_dwconv2d_chw(state,
|
378 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8,
|
379 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
380 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
381 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
382 |
+
}
|
383 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
|
384 |
+
f16_dwconv2d_chw(state,
|
385 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8,
|
386 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
387 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
388 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
389 |
+
}
|
390 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
|
391 |
+
f16_dwconv2d_chw(state,
|
392 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8,
|
393 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
394 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
395 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
396 |
+
}
|
397 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
|
398 |
+
f16_dwconv2d_chw(state,
|
399 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc2,
|
400 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
401 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
402 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
403 |
+
}
|
404 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
|
405 |
+
f16_dwconv2d_chw(state,
|
406 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc3,
|
407 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
408 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
409 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
410 |
+
}
|
411 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
|
412 |
+
f16_dwconv2d_chw(state,
|
413 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc4,
|
414 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
415 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
416 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
417 |
+
}
|
418 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
|
419 |
+
f16_dwconv2d_chw(state,
|
420 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc5,
|
421 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
422 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
423 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
424 |
+
}
|
425 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
|
426 |
+
f16_dwconv2d_chw(state,
|
427 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc2,
|
428 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
429 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
430 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
431 |
+
}
|
432 |
+
static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
|
433 |
+
f16_dwconv2d_chw(state,
|
434 |
+
xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc3,
|
435 |
+
xnn_init_f16_chw_neonfp16arith_stride2_params,
|
436 |
+
5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
|
437 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
438 |
+
}
|
439 |
+
// Benchmarks the f16 5x5, padding-2, stride-2 CHW depthwise-convolution
// microkernel variant that computes 3 output rows with 8-wide vectors and
// 2 accumulators per output.
static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
  f16_dwconv2d_chw(state,
    // Bug fix: this wrapper previously invoked the 1x8_acc5 microkernel,
    // so the results reported under the 3x8_acc2 name measured the wrong
    // kernel. It now benchmarks the matching 3x8_acc2 variant.
    xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8_acc2,
    xnn_init_f16_chw_neonfp16arith_stride2_params,
    5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
    benchmark::utils::CheckNEONFP16ARITH);
}
|
446 |
+
|
447 |
+
// Register every wrapper above with the benchmark framework, grouped by
// kernel geometry: 3x3 stride 1, 3x3 stride 2, 5x5 stride 1, 5x5 stride 2.
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_3x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_4x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_5x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_6x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_3x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_4x8)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_5x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2)
// NOTE(review): trailer corrected to match the opening condition, which also
// covers 32-bit ARM, not just ARM64.
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


// Provide a main() unless the build links these benchmarks into a combined
// runner that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-f32-vcvt.cc
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2021 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vcvt.h>
|
22 |
+
|
23 |
+
|
24 |
+
// Benchmark harness for a single f16->f32 vector-convert microkernel.
//
// Drives `cvt` over a randomly initialized buffer whose element count comes
// from state.range(0), and reports element and byte throughput plus the
// detected CPU frequency. `init_params` (optional) fills the microkernel
// parameter structure; `isa_check` (optional) skips the benchmark when the
// host CPU lacks the required ISA extension.
static void f16_f32_vcvt(
  benchmark::State& state,
  xnn_f16_f32_vcvt_ukernel_fn cvt,
  xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t batch_size = state.range(0);

  // Random half-precision inputs drawn from [-10, 10].
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // The input is over-allocated by XNN_EXTRA_BYTES, as microkernels may read
  // (but not use) a little past the end of the buffer.
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::vector<float, AlignedAllocator<float, 64>> output(batch_size);
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Poison the output so stale data cannot masquerade as results.
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_f16_f32_cvt_params params;
  if (init_params != nullptr) {
    init_params(&params);
  }

  for (auto _ : state) {
    // The first argument is the input size in bytes, not elements.
    cvt(batch_size * sizeof(uint16_t), input.data(), output.data(), &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  // Each element is read as uint16_t and written as float.
  const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
67 |
+
|
68 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
69 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x8,
|
70 |
+
xnn_f16_f32_vcvt_ukernel__neonfp16_x8,
|
71 |
+
nullptr /* init params */,
|
72 |
+
benchmark::utils::CheckNEONFP16)
|
73 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
74 |
+
->UseRealTime();
|
75 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x16,
|
76 |
+
xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
|
77 |
+
nullptr /* init params */,
|
78 |
+
benchmark::utils::CheckNEONFP16)
|
79 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
80 |
+
->UseRealTime();
|
81 |
+
|
82 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x8,
|
83 |
+
xnn_f16_f32_vcvt_ukernel__neon_int16_x8,
|
84 |
+
xnn_init_f16_f32_cvt_neon_params,
|
85 |
+
benchmark::utils::CheckNEON)
|
86 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
87 |
+
->UseRealTime();
|
88 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x16,
|
89 |
+
xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
|
90 |
+
xnn_init_f16_f32_cvt_neon_params,
|
91 |
+
benchmark::utils::CheckNEON)
|
92 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
93 |
+
->UseRealTime();
|
94 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x24,
|
95 |
+
xnn_f16_f32_vcvt_ukernel__neon_int16_x24,
|
96 |
+
xnn_init_f16_f32_cvt_neon_params,
|
97 |
+
benchmark::utils::CheckNEON)
|
98 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
99 |
+
->UseRealTime();
|
100 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x32,
|
101 |
+
xnn_f16_f32_vcvt_ukernel__neon_int16_x32,
|
102 |
+
xnn_init_f16_f32_cvt_neon_params,
|
103 |
+
benchmark::utils::CheckNEON)
|
104 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
105 |
+
->UseRealTime();
|
106 |
+
|
107 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x8,
|
108 |
+
xnn_f16_f32_vcvt_ukernel__neon_int32_x8,
|
109 |
+
xnn_init_f16_f32_cvt_neon_params,
|
110 |
+
benchmark::utils::CheckNEON)
|
111 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
112 |
+
->UseRealTime();
|
113 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x16,
|
114 |
+
xnn_f16_f32_vcvt_ukernel__neon_int32_x16,
|
115 |
+
xnn_init_f16_f32_cvt_neon_params,
|
116 |
+
benchmark::utils::CheckNEON)
|
117 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
118 |
+
->UseRealTime();
|
119 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x24,
|
120 |
+
xnn_f16_f32_vcvt_ukernel__neon_int32_x24,
|
121 |
+
xnn_init_f16_f32_cvt_neon_params,
|
122 |
+
benchmark::utils::CheckNEON)
|
123 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
124 |
+
->UseRealTime();
|
125 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x32,
|
126 |
+
xnn_f16_f32_vcvt_ukernel__neon_int32_x32,
|
127 |
+
xnn_init_f16_f32_cvt_neon_params,
|
128 |
+
benchmark::utils::CheckNEON)
|
129 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
130 |
+
->UseRealTime();
|
131 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
132 |
+
|
133 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
134 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x16,
|
135 |
+
xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
|
136 |
+
nullptr /* init params */,
|
137 |
+
benchmark::utils::CheckAVX512SKX)
|
138 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
139 |
+
->UseRealTime();
|
140 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x32,
|
141 |
+
xnn_f16_f32_vcvt_ukernel__avx512skx_x32,
|
142 |
+
nullptr /* init params */,
|
143 |
+
benchmark::utils::CheckAVX512SKX)
|
144 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
145 |
+
->UseRealTime();
|
146 |
+
|
147 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x8,
|
148 |
+
xnn_f16_f32_vcvt_ukernel__f16c_x8,
|
149 |
+
nullptr /* init params */,
|
150 |
+
benchmark::utils::CheckF16C)
|
151 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
152 |
+
->UseRealTime();
|
153 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x16,
|
154 |
+
xnn_f16_f32_vcvt_ukernel__f16c_x16,
|
155 |
+
nullptr /* init params */,
|
156 |
+
benchmark::utils::CheckF16C)
|
157 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
158 |
+
->UseRealTime();
|
159 |
+
|
160 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x8,
|
161 |
+
xnn_f16_f32_vcvt_ukernel__avx_int16_x8,
|
162 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
163 |
+
benchmark::utils::CheckAVX)
|
164 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
165 |
+
->UseRealTime();
|
166 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x16,
|
167 |
+
xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
|
168 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
169 |
+
benchmark::utils::CheckAVX)
|
170 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
171 |
+
->UseRealTime();
|
172 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x24,
|
173 |
+
xnn_f16_f32_vcvt_ukernel__avx_int16_x24,
|
174 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
175 |
+
benchmark::utils::CheckAVX)
|
176 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
177 |
+
->UseRealTime();
|
178 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x32,
|
179 |
+
xnn_f16_f32_vcvt_ukernel__avx_int16_x32,
|
180 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
181 |
+
benchmark::utils::CheckAVX)
|
182 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
183 |
+
->UseRealTime();
|
184 |
+
|
185 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x8,
|
186 |
+
xnn_f16_f32_vcvt_ukernel__avx_int32_x8,
|
187 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
188 |
+
benchmark::utils::CheckAVX)
|
189 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
190 |
+
->UseRealTime();
|
191 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x16,
|
192 |
+
xnn_f16_f32_vcvt_ukernel__avx_int32_x16,
|
193 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
194 |
+
benchmark::utils::CheckAVX)
|
195 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
196 |
+
->UseRealTime();
|
197 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x24,
|
198 |
+
xnn_f16_f32_vcvt_ukernel__avx_int32_x24,
|
199 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
200 |
+
benchmark::utils::CheckAVX)
|
201 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
202 |
+
->UseRealTime();
|
203 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x32,
|
204 |
+
xnn_f16_f32_vcvt_ukernel__avx_int32_x32,
|
205 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
206 |
+
benchmark::utils::CheckAVX)
|
207 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
208 |
+
->UseRealTime();
|
209 |
+
|
210 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x8,
|
211 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int16_x8,
|
212 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
213 |
+
benchmark::utils::CheckSSE41)
|
214 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
215 |
+
->UseRealTime();
|
216 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x16,
|
217 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
|
218 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
219 |
+
benchmark::utils::CheckSSE41)
|
220 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
221 |
+
->UseRealTime();
|
222 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x24,
|
223 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int16_x24,
|
224 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
225 |
+
benchmark::utils::CheckSSE41)
|
226 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
227 |
+
->UseRealTime();
|
228 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x32,
|
229 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int16_x32,
|
230 |
+
xnn_init_f16_f32_cvt_sse_int16_params,
|
231 |
+
benchmark::utils::CheckSSE41)
|
232 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
233 |
+
->UseRealTime();
|
234 |
+
|
235 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x8,
|
236 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int32_x8,
|
237 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
238 |
+
benchmark::utils::CheckSSE41)
|
239 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
240 |
+
->UseRealTime();
|
241 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x16,
|
242 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int32_x16,
|
243 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
244 |
+
benchmark::utils::CheckSSE41)
|
245 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
246 |
+
->UseRealTime();
|
247 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x24,
|
248 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int32_x24,
|
249 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
250 |
+
benchmark::utils::CheckSSE41)
|
251 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
252 |
+
->UseRealTime();
|
253 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x32,
|
254 |
+
xnn_f16_f32_vcvt_ukernel__sse41_int32_x32,
|
255 |
+
xnn_init_f16_f32_cvt_sse_int32_params,
|
256 |
+
benchmark::utils::CheckSSE41)
|
257 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
258 |
+
->UseRealTime();
|
259 |
+
|
260 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x8,
|
261 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int16_x8,
|
262 |
+
xnn_init_f16_f32_cvt_sse_int16_params)
|
263 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
264 |
+
->UseRealTime();
|
265 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x16,
|
266 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int16_x16,
|
267 |
+
xnn_init_f16_f32_cvt_sse_int16_params)
|
268 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
269 |
+
->UseRealTime();
|
270 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x24,
|
271 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int16_x24,
|
272 |
+
xnn_init_f16_f32_cvt_sse_int16_params)
|
273 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
274 |
+
->UseRealTime();
|
275 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x32,
|
276 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
|
277 |
+
xnn_init_f16_f32_cvt_sse_int16_params)
|
278 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
279 |
+
->UseRealTime();
|
280 |
+
|
281 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x8,
|
282 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int32_x8,
|
283 |
+
xnn_init_f16_f32_cvt_sse_int32_params)
|
284 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
285 |
+
->UseRealTime();
|
286 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x16,
|
287 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int32_x16,
|
288 |
+
xnn_init_f16_f32_cvt_sse_int32_params)
|
289 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
290 |
+
->UseRealTime();
|
291 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x24,
|
292 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int32_x24,
|
293 |
+
xnn_init_f16_f32_cvt_sse_int32_params)
|
294 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
295 |
+
->UseRealTime();
|
296 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x32,
|
297 |
+
xnn_f16_f32_vcvt_ukernel__sse2_int32_x32,
|
298 |
+
xnn_init_f16_f32_cvt_sse_int32_params)
|
299 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
300 |
+
->UseRealTime();
|
301 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
302 |
+
|
303 |
+
#if XNN_ARCH_WASMRELAXEDSIMD
// Relaxed-SIMD WebAssembly variants of the f16->f32 conversion benchmark.
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x8,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x8,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x16,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x16,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x24,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x24,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x32,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x32,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();

BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x8,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x8,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x16,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x16,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x24,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x24,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x32,
                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x32,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMRELAXEDSIMD
|
346 |
+
|
347 |
+
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Baseline WebAssembly SIMD variants (also valid under relaxed SIMD).
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x8,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x8,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x16,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x24,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x24,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x32,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x32,
                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();

BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x8,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x8,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x16,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x16,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x24,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x24,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x32,
                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x32,
                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
|
390 |
+
|
391 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x1,
|
392 |
+
xnn_f16_f32_vcvt_ukernel__scalar_x1,
|
393 |
+
xnn_init_f16_f32_cvt_scalar_params)
|
394 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
395 |
+
->UseRealTime();
|
396 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x2,
|
397 |
+
xnn_f16_f32_vcvt_ukernel__scalar_x2,
|
398 |
+
xnn_init_f16_f32_cvt_scalar_params)
|
399 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
400 |
+
->UseRealTime();
|
401 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x3,
|
402 |
+
xnn_f16_f32_vcvt_ukernel__scalar_x3,
|
403 |
+
xnn_init_f16_f32_cvt_scalar_params)
|
404 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
405 |
+
->UseRealTime();
|
406 |
+
BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x4,
|
407 |
+
xnn_f16_f32_vcvt_ukernel__scalar_x4,
|
408 |
+
xnn_init_f16_f32_cvt_scalar_params)
|
409 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
|
410 |
+
->UseRealTime();
|
411 |
+
|
412 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
413 |
+
BENCHMARK_MAIN();
|
414 |
+
#endif
|
bench/f16-f32acc-gemm.cc
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cfloat>
|
11 |
+
#include <cmath>
|
12 |
+
#include <functional>
|
13 |
+
#include <random>
|
14 |
+
#include <vector>
|
15 |
+
|
16 |
+
#include <benchmark/benchmark.h>
|
17 |
+
#include <fp16/fp16.h>
|
18 |
+
#include "bench/gemm.h"
|
19 |
+
#include "bench/utils.h"
|
20 |
+
|
21 |
+
#include <xnnpack.h>
|
22 |
+
#include <xnnpack/aligned-allocator.h>
|
23 |
+
#include <xnnpack/common.h>
|
24 |
+
#include <xnnpack/gemm.h>
|
25 |
+
#include <xnnpack/math.h>
|
26 |
+
#include <xnnpack/pack.h>
|
27 |
+
#include <xnnpack/microfnptr.h>
|
28 |
+
#include <xnnpack/microparams-init.h>
|
29 |
+
|
30 |
+
|
31 |
+
// Benchmarks one f16 GEMM minmax microkernel over an M x N x K problem taken
// from the benchmark state (ranges 0..2).  Packed weights and outputs are
// rotated through enough buffers to exceed the last-level cache, while the
// LHS matrix is prefetched into L1 before each timed iteration.
static void f16_gemm(benchmark::State& state,
  xnn_f16_gemm_minmax_ukernel_fn gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f16_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip entirely when the host CPU lacks the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // Dimensions rounded up to the microkernel's packing granularity.
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // LHS matrix (A), kernel (K), and bias (B), filled with random halves.
  std::vector<uint16_t> lhs(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(lhs.begin(), lhs.end(), std::ref(f16rng));
  std::vector<uint16_t> kernel(nc * kc);
  std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
  std::vector<uint16_t> bias(nc);
  std::generate(bias.begin(), bias.end(), std::ref(f16rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  // Enough W/C copies that rotating through them defeats all cache levels.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_w(w_elements * num_buffers);
  std::fill(packed_w.begin(), packed_w.end(), 0);
  xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, kernel.data(), bias.data(), packed_w.data(), 0, nullptr);
  std::vector<uint16_t> out(c_elements * num_buffers);
  std::fill(out.begin(), out.end(), UINT16_C(0x7E00) /* NaN */);

  // No output clamping: min = -inf, max = +inf (fp16 bit patterns).
  xnn_f16_minmax_params params;
  init_params(&params,
    UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Control the cache state outside the timed region:
    // - A stays hot (prefetched into L1 if it fits)
    // - W and C stay cold (fresh rotation slot each iteration)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(lhs.data(), lhs.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the full problem into mr x nr microkernel invocations.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint16_t),
          lhs.data() + m * kc, kc * sizeof(uint16_t),
          packed_w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
          out.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
          &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // A GEMM performs 2*M*N*K floating-point operations (multiply + add).
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
|
110 |
+
|
111 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
112 |
+
static void f16_f32acc_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
113 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
|
114 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
115 |
+
}
|
116 |
+
static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
117 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
|
118 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
119 |
+
}
|
120 |
+
static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
121 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
|
122 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
123 |
+
}
|
124 |
+
static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
125 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
|
126 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
127 |
+
}
|
128 |
+
static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
129 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
|
130 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
131 |
+
}
|
132 |
+
static void f16_f32acc_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
133 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
|
134 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
135 |
+
}
|
136 |
+
static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
137 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
|
138 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
139 |
+
}
|
140 |
+
static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
141 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
|
142 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
143 |
+
}
|
144 |
+
static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
145 |
+
f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
|
146 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
147 |
+
}
|
148 |
+
|
149 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_1x8__avx2_broadcast)
|
150 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_4x8__avx2_broadcast)
|
151 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_5x8__avx2_broadcast)
|
152 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_6x8__avx2_broadcast)
|
153 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_7x8__avx2_broadcast)
|
154 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_1x16__avx2_broadcast)
|
155 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_3x16__avx2_broadcast)
|
156 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_4x16__avx2_broadcast)
|
157 |
+
BENCHMARK_GEMM(f16_f32acc_gemm_5x16__avx2_broadcast)
|
158 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
159 |
+
|
160 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
161 |
+
BENCHMARK_MAIN();
|
162 |
+
#endif
|
bench/f16-f32acc-igemm.cc
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/conv.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/igemm.h>
|
22 |
+
#include <xnnpack/indirection.h>
|
23 |
+
#include <xnnpack/microfnptr.h>
|
24 |
+
#include <xnnpack/microparams-init.h>
|
25 |
+
#include <xnnpack/operator.h>
|
26 |
+
#include <xnnpack/pack.h>
|
27 |
+
|
28 |
+
|
29 |
+
// Micro-benchmark driver for f16 IGEMM (indirect GEMM) minmax microkernels.
// Sets up a convolution-shaped problem from the benchmark state (ranges:
// input HxW, kernel HxW, padding, subsampling, dilation, per-group channels),
// packs the weights, builds the indirection buffer through a stub
// xnn_operator, and times the microkernel over circular buffers that exceed
// the cache size.
static void f16_igemm(benchmark::State& state,
  xnn_f16_igemm_minmax_ukernel_fn igemm,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  xnn_init_f16_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip entirely when the host CPU lacks the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Derived convolution geometry (padding split evenly, SAME-style).
  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  // Dimensions rounded up to the microkernel's packing granularity.
  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  // Input activations (A), kernel (K), and bias (B), random halves.
  std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng))_;
  std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  // Zero buffer substituted for out-of-bounds (padding) input pixels.
  std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));

  const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  // Enough buffer copies that rotating through them defeats all cache levels.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0)_;
  xnn_pack_f16_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  // Indirection buffer: per-output-pixel pointers into the input (or zero
  // buffer for padding), built by xnn_indirection_init_conv2d through a
  // stub xnn_operator that only carries the fields the helper reads.
  std::vector<const uint16_t*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Prepare minmax parameters: no clamping, i.e. min = -inf, max = +inf.
  // NOTE: the arguments were previously swapped (min = +inf, max = -inf),
  // which requested an empty clamping range; this now matches the order used
  // by the f16 GEMM benchmark.
  xnn_f16_minmax_params params;
  init_params(&params,
    UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Keep A hot in L1 and rotate W/I/C buffers so they stay cold.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the output pixels (m) and output channels (n) into mr x nr calls.
    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      for (uint32_t n = 0; n < group_output_channels; n += nr) {
        const uint32_t nb = min(group_output_channels - n, nr);
        igemm(
          mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
          reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
          w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
          c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
          0, z.data(), &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per MAC of the implied convolution.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
162 |
+
|
163 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
164 |
+
static void f16_f32acc_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
165 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
|
166 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
167 |
+
}
|
168 |
+
static void f16_f32acc_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
169 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
|
170 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
171 |
+
}
|
172 |
+
static void f16_f32acc_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
173 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
|
174 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
175 |
+
}
|
176 |
+
static void f16_f32acc_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
177 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
|
178 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
179 |
+
}
|
180 |
+
static void f16_f32acc_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
181 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
|
182 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
183 |
+
}
|
184 |
+
static void f16_f32acc_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
185 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
|
186 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
187 |
+
}
|
188 |
+
static void f16_f32acc_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
189 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
|
190 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
191 |
+
}
|
192 |
+
static void f16_f32acc_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
193 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
|
194 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
195 |
+
}
|
196 |
+
static void f16_f32acc_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
197 |
+
f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
|
198 |
+
xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
|
199 |
+
}
|
200 |
+
|
201 |
+
BENCHMARK_CONV(f16_f32acc_igemm_1x8__avx2_broadcast)
|
202 |
+
BENCHMARK_CONV(f16_f32acc_igemm_4x8__avx2_broadcast)
|
203 |
+
BENCHMARK_CONV(f16_f32acc_igemm_5x8__avx2_broadcast)
|
204 |
+
BENCHMARK_CONV(f16_f32acc_igemm_6x8__avx2_broadcast)
|
205 |
+
BENCHMARK_CONV(f16_f32acc_igemm_7x8__avx2_broadcast)
|
206 |
+
BENCHMARK_CONV(f16_f32acc_igemm_1x16__avx2_broadcast)
|
207 |
+
BENCHMARK_CONV(f16_f32acc_igemm_3x16__avx2_broadcast)
|
208 |
+
BENCHMARK_CONV(f16_f32acc_igemm_4x16__avx2_broadcast)
|
209 |
+
BENCHMARK_CONV(f16_f32acc_igemm_5x16__avx2_broadcast)
|
210 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
211 |
+
|
212 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
213 |
+
BENCHMARK_MAIN();
|
214 |
+
#endif
|
bench/f16-f32acc-rsum.cc
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/reduce.h>
|
22 |
+
|
23 |
+
|
24 |
+
// Benchmarks one f16 reduce-sum microkernel with f32 accumulation over a
// buffer whose element count comes from the benchmark state (range 0).
static void f16_f32acc_rsum(
  benchmark::State& state,
  xnn_f16_f32acc_rsum_ukernel_fn rsum,
  xnn_init_f16_f32acc_scale_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;  // required ISA extension not available on this CPU
  }

  const size_t elements = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // 64-byte-aligned input filled with random halves drawn from [-1, 1].
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
  std::generate(input.begin(), input.end(), std::ref(f16rng));

  xnn_f16_f32acc_scale_params params;
  init_params(&params, /*scale=*/0.1f);

  // NaN sentinel; overwritten by the kernel on the first invocation.
  uint16_t output = UINT16_C(0x7E00);
  for (auto _ : state) {
    rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Report throughput both as elements and as bytes consumed per second.
  const size_t elements_per_iteration = elements;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = elements * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
65 |
+
|
66 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
|
67 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x4,
|
68 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x4,
|
69 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
70 |
+
benchmark::utils::CheckNEONFP16)
|
71 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
72 |
+
->UseRealTime();
|
73 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x8,
|
74 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x8,
|
75 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
76 |
+
benchmark::utils::CheckNEONFP16)
|
77 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
78 |
+
->UseRealTime();
|
79 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x16_acc2,
|
80 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x16_acc2,
|
81 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
82 |
+
benchmark::utils::CheckNEONFP16)
|
83 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
84 |
+
->UseRealTime();
|
85 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x24_acc3,
|
86 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x24_acc3,
|
87 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
88 |
+
benchmark::utils::CheckNEONFP16)
|
89 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
90 |
+
->UseRealTime();
|
91 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc2,
|
92 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc2,
|
93 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
94 |
+
benchmark::utils::CheckNEONFP16)
|
95 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
96 |
+
->UseRealTime();
|
97 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc4,
|
98 |
+
xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc4,
|
99 |
+
xnn_init_f16_f32acc_scale_scalar_params,
|
100 |
+
benchmark::utils::CheckNEONFP16)
|
101 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
102 |
+
->UseRealTime();
|
103 |
+
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
|
104 |
+
|
105 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
106 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x8,
|
107 |
+
xnn_f16_f32acc_rsum_ukernel__f16c_x8,
|
108 |
+
xnn_init_f16_f32acc_scale_avx_params,
|
109 |
+
benchmark::utils::CheckF16C)
|
110 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
111 |
+
->UseRealTime();
|
112 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x16_acc2,
|
113 |
+
xnn_f16_f32acc_rsum_ukernel__f16c_x16_acc2,
|
114 |
+
xnn_init_f16_f32acc_scale_avx_params,
|
115 |
+
benchmark::utils::CheckF16C)
|
116 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
117 |
+
->UseRealTime();
|
118 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x24_acc3,
|
119 |
+
xnn_f16_f32acc_rsum_ukernel__f16c_x24_acc3,
|
120 |
+
xnn_init_f16_f32acc_scale_avx_params,
|
121 |
+
benchmark::utils::CheckF16C)
|
122 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
123 |
+
->UseRealTime();
|
124 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc2,
|
125 |
+
xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc2,
|
126 |
+
xnn_init_f16_f32acc_scale_avx_params,
|
127 |
+
benchmark::utils::CheckF16C)
|
128 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
129 |
+
->UseRealTime();
|
130 |
+
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc4,
|
131 |
+
xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc4,
|
132 |
+
xnn_init_f16_f32acc_scale_avx_params,
|
133 |
+
benchmark::utils::CheckF16C)
|
134 |
+
->Apply(benchmark::utils::ReductionParameters<uint16_t>)
|
135 |
+
->UseRealTime();
|
136 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
137 |
+
|
138 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
139 |
+
BENCHMARK_MAIN();
|
140 |
+
#endif
|
bench/f16-gavgpool-cw.cc
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <numeric>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include "bench/utils.h"
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
|
15 |
+
#include <xnnpack.h>
|
16 |
+
#include <xnnpack/aligned-allocator.h>
|
17 |
+
#include <xnnpack/common.h>
|
18 |
+
#include <xnnpack/gavgpool.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
|
22 |
+
|
23 |
+
// Benchmarks one F16 channel-wise global-average-pooling microkernel.
//
// state.range(0) = number of channels, state.range(1) = elements per channel.
// The kernel is timed over `elements * channels` half-precision values; the
// actual numeric contents are irrelevant to timing, so the buffer is filled
// with an arbitrary iota pattern of raw 16-bit words.
void f16_gavgpool_cw(
  benchmark::State& state,
  xnn_f16_gavgpool_cw_ukernel_fn gavgpool_cw,
  xnn_init_f16_gavgpool_neonfp16arith_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip (and report) if the current CPU lacks the required ISA extension.
  if (isa_check && !isa_check(state)) {
    return;
  }
  const size_t channels = state.range(0);
  const size_t elements = state.range(1);

  // F16 data is stored as raw 16-bit words; XNN_EXTRA_BYTES of slack lets the
  // microkernel over-read past the logical end of the input.
  std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(elements * channels + XNN_EXTRA_BYTES / sizeof(int16_t));
  std::vector<int16_t> output(channels);
  std::iota(input.begin(), input.end(), 0);

  // Prepare parameters: scale = 1.0 (0x3C00 in FP16) and no output clamping
  // (min = -inf, max = +inf).
  union xnn_f16_gavgpool_params params;
  init_params(&params,
    UINT16_C(0x3C00) /* scale */, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */, elements);

  for (auto _ : state) {
    gavgpool_cw(elements, channels, input.data(), output.data(), &params);
  }

  // Record the CPU frequency so per-cycle metrics can be derived offline.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }
}
|
53 |
+
|
54 |
+
// Registers the (channels, elements) problem sizes swept by the
// f16_gavgpool_cw benchmarks: a fixed 1024-element column over a range of
// channel counts.
static void BenchmarkBatch(benchmark::internal::Benchmark* b)
{
  b->ArgNames({"channels", "elements"});
  b->Args({1, 1024});
  b->Args({2, 1024});
  b->Args({4, 1024});
  b->Args({6, 1024});
  b->Args({8, 1024});
  b->Args({16, 1024});
  b->Args({1024, 1024});
}
|
65 |
+
|
66 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
67 |
+
BENCHMARK_CAPTURE(f16_gavgpool_cw, f16_neon_x8,
|
68 |
+
xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8,
|
69 |
+
xnn_init_f16_gavgpool_neonfp16arith_params,
|
70 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
71 |
+
->Apply(BenchmarkBatch)
|
72 |
+
->UseRealTime();
|
73 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
74 |
+
|
75 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
76 |
+
BENCHMARK_MAIN();
|
77 |
+
#endif
|
bench/f16-gemm-e2e.cc
ADDED
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <cstring>
|
9 |
+
#include <functional>
|
10 |
+
#include <memory>
|
11 |
+
#include <random>
|
12 |
+
#include <vector>
|
13 |
+
|
14 |
+
#include "bench/end2end.h"
|
15 |
+
#include "bench/utils.h"
|
16 |
+
#include <benchmark/benchmark.h>
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/config.h>
|
20 |
+
#include <xnnpack/gemm.h>
|
21 |
+
#include <xnnpack/igemm.h>
|
22 |
+
#include <xnnpack/microfnptr.h>
|
23 |
+
#include <xnnpack/microparams-init.h>
|
24 |
+
#include <xnnpack/models.h>
|
25 |
+
#include <xnnpack/pack.h>
|
26 |
+
|
27 |
+
|
28 |
+
// Runs an end-to-end model benchmark with the global F16 GEMM configuration
// forcibly overridden to use the caller-supplied microkernels.
//
// gemm_minmax/igemm_minmax handle the multi-row (mr) case; gemm1_minmax/
// igemm1_minmax handle the single-row remainder. mr/nr/log2_kr/log2_sr must
// match the tiling of the supplied microkernels.
static void GEMMEnd2EndBenchmark(
  benchmark::State& state,
  models::ExecutionPlanFactory model_factory,
  xnn_f16_gemm_minmax_ukernel_fn gemm_minmax,
  xnn_f16_igemm_minmax_ukernel_fn igemm_minmax,
  xnn_f16_gemm_minmax_ukernel_fn gemm1_minmax,
  xnn_f16_igemm_minmax_ukernel_fn igemm1_minmax,
  xnn_init_f16_minmax_params_fn init_params,
  uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }
  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config();
  if (gemm_config == nullptr) {
    state.SkipWithError("hardware does not support F16 gemm");
    return;
  }

  // Override microkernels chosen in xnn_initialize: wipe the config and
  // install only the benchmarked kernels (slot 0 = 1-row, slot mr-1 = mr-row).
  std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config));
  gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_minmax));
  gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_minmax));
  gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_minmax));
  gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_minmax));
  gemm_config->init.f16 = init_params;
  gemm_config->mr = mr;
  gemm_config->nr = nr;
  gemm_config->log2_kr = log2_kr;
  gemm_config->log2_sr = log2_sr;
  gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f16_gemm_goi_w;

  // Build the model AFTER the override so operator creation picks up the
  // benchmarked kernels.
  auto execution_plan = model_factory(nullptr);
  if (execution_plan.empty()) {
    state.SkipWithError("failed to create a model");
    return;
  }

  for (auto _ : state) {
    for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
      xnn_status status = xnn_run_operator(op.get(), nullptr);
      if (status != xnn_status_success) {
        state.SkipWithError("failed to run a model");
        return;
      }
    }
  }

  // Record the CPU frequency so per-cycle metrics can be derived offline.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }
}
|
87 |
+
|
88 |
+
// AArch64 hand-written-assembly F16 GEMM end-to-end benchmarks.
// NOTE: the original guard used bitwise `&` (`... XNN_ARCH_ARM64 & XNN_ENABLE_ASSEMBLY`);
// logical `&&` is what is intended here (consistent with every other guard in
// this file) and is equivalent only by accident when both macros are 0/1.
#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
      xnn_init_f16_minmax_fp16arith_params,
      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
      xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
      xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
      xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
      xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  BENCHMARK_FP16_END2END(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32);
  BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75);
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
|
210 |
+
|
211 |
+
// NEON-FP16-arith intrinsics F16 GEMM end-to-end benchmarks (ARM and ARM64).
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
    GEMMEnd2EndBenchmark(state, model,
      xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
      xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
      xnn_init_f16_minmax_fp16arith_params,
      8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  BENCHMARK_FP16_END2END(f16_gemm_4x8__neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_6x8__neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_8x8__neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_4x16__neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_6x16__neonfp16arith_ld64);
  BENCHMARK_FP16_END2END(f16_gemm_8x16__neonfp16arith_ld64);
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
285 |
+
|
286 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
287 |
+
static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
288 |
+
GEMMEnd2EndBenchmark(state, model,
|
289 |
+
xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
|
290 |
+
xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
|
291 |
+
xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
292 |
+
xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
293 |
+
xnn_init_f16_minmax_avx_params,
|
294 |
+
4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
295 |
+
benchmark::utils::CheckAVX2);
|
296 |
+
}
|
297 |
+
static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
298 |
+
GEMMEnd2EndBenchmark(state, model,
|
299 |
+
xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
|
300 |
+
xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
|
301 |
+
xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
302 |
+
xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
303 |
+
xnn_init_f16_minmax_avx_params,
|
304 |
+
5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
305 |
+
benchmark::utils::CheckAVX2);
|
306 |
+
}
|
307 |
+
static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
308 |
+
GEMMEnd2EndBenchmark(state, model,
|
309 |
+
xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
|
310 |
+
xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
|
311 |
+
xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
312 |
+
xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
313 |
+
xnn_init_f16_minmax_avx_params,
|
314 |
+
6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
315 |
+
benchmark::utils::CheckAVX2);
|
316 |
+
}
|
317 |
+
static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
318 |
+
GEMMEnd2EndBenchmark(state, model,
|
319 |
+
xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
|
320 |
+
xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
|
321 |
+
xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
322 |
+
xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
323 |
+
xnn_init_f16_minmax_avx_params,
|
324 |
+
7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
325 |
+
benchmark::utils::CheckAVX2);
|
326 |
+
}
|
327 |
+
|
328 |
+
static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
329 |
+
GEMMEnd2EndBenchmark(state, model,
|
330 |
+
xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
|
331 |
+
xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
|
332 |
+
xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
333 |
+
xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
334 |
+
xnn_init_f16_minmax_avx_params,
|
335 |
+
3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
336 |
+
benchmark::utils::CheckAVX2);
|
337 |
+
}
|
338 |
+
static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
339 |
+
GEMMEnd2EndBenchmark(state, model,
|
340 |
+
xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
|
341 |
+
xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
|
342 |
+
xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
343 |
+
xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
344 |
+
xnn_init_f16_minmax_avx_params,
|
345 |
+
4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
346 |
+
benchmark::utils::CheckAVX2);
|
347 |
+
}
|
348 |
+
static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
349 |
+
GEMMEnd2EndBenchmark(state, model,
|
350 |
+
xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
|
351 |
+
xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
|
352 |
+
xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
353 |
+
xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
354 |
+
xnn_init_f16_minmax_avx_params,
|
355 |
+
5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
356 |
+
benchmark::utils::CheckAVX2);
|
357 |
+
}
|
358 |
+
|
359 |
+
static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
360 |
+
GEMMEnd2EndBenchmark(state, model,
|
361 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast,
|
362 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast,
|
363 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
364 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
365 |
+
xnn_init_f16_minmax_avx_params,
|
366 |
+
4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
367 |
+
benchmark::utils::CheckAVX2);
|
368 |
+
}
|
369 |
+
static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
370 |
+
GEMMEnd2EndBenchmark(state, model,
|
371 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast,
|
372 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast,
|
373 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
374 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
375 |
+
xnn_init_f16_minmax_avx_params,
|
376 |
+
5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
377 |
+
benchmark::utils::CheckAVX2);
|
378 |
+
}
|
379 |
+
static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
380 |
+
GEMMEnd2EndBenchmark(state, model,
|
381 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast,
|
382 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast,
|
383 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
384 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
385 |
+
xnn_init_f16_minmax_avx_params,
|
386 |
+
6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
387 |
+
benchmark::utils::CheckAVX2);
|
388 |
+
}
|
389 |
+
static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
390 |
+
GEMMEnd2EndBenchmark(state, model,
|
391 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast,
|
392 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast,
|
393 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
|
394 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
395 |
+
xnn_init_f16_minmax_avx_params,
|
396 |
+
7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
397 |
+
benchmark::utils::CheckAVX2);
|
398 |
+
}
|
399 |
+
|
400 |
+
static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
401 |
+
GEMMEnd2EndBenchmark(state, model,
|
402 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast,
|
403 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast,
|
404 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
405 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
406 |
+
xnn_init_f16_minmax_avx_params,
|
407 |
+
3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
408 |
+
benchmark::utils::CheckAVX2);
|
409 |
+
}
|
410 |
+
static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
411 |
+
GEMMEnd2EndBenchmark(state, model,
|
412 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast,
|
413 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast,
|
414 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
415 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
416 |
+
xnn_init_f16_minmax_avx_params,
|
417 |
+
4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
418 |
+
benchmark::utils::CheckAVX2);
|
419 |
+
}
|
420 |
+
static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
|
421 |
+
GEMMEnd2EndBenchmark(state, model,
|
422 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast,
|
423 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast,
|
424 |
+
xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
|
425 |
+
xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
426 |
+
xnn_init_f16_minmax_avx_params,
|
427 |
+
5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
|
428 |
+
benchmark::utils::CheckAVX2);
|
429 |
+
}
|
430 |
+
|
431 |
+
BENCHMARK_FP16_END2END(f16_gemm_4x8__avx2_broadcast);
|
432 |
+
BENCHMARK_FP16_END2END(f16_gemm_5x8__avx2_broadcast);
|
433 |
+
BENCHMARK_FP16_END2END(f16_gemm_6x8__avx2_broadcast);
|
434 |
+
BENCHMARK_FP16_END2END(f16_gemm_7x8__avx2_broadcast);
|
435 |
+
|
436 |
+
BENCHMARK_FP16_END2END(f16_gemm_3x16__avx2_broadcast);
|
437 |
+
BENCHMARK_FP16_END2END(f16_gemm_4x16__avx2_broadcast);
|
438 |
+
BENCHMARK_FP16_END2END(f16_gemm_5x16__avx2_broadcast);
|
439 |
+
|
440 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x8__avx2_broadcast);
|
441 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x8__avx2_broadcast);
|
442 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_6x8__avx2_broadcast);
|
443 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_7x8__avx2_broadcast);
|
444 |
+
|
445 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_3x16__avx2_broadcast);
|
446 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x16__avx2_broadcast);
|
447 |
+
BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x16__avx2_broadcast);
|
448 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
449 |
+
|
450 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
451 |
+
BENCHMARK_MAIN();
|
452 |
+
#endif
|
bench/f16-gemm.cc
ADDED
@@ -0,0 +1,513 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
// All rights reserved.
|
3 |
+
//
|
4 |
+
// Copyright 2019 Google LLC
|
5 |
+
//
|
6 |
+
// This source code is licensed under the BSD-style license found in the
|
7 |
+
// LICENSE file in the root directory of this source tree.
|
8 |
+
|
9 |
+
#include <algorithm>
|
10 |
+
#include <cfloat>
|
11 |
+
#include <cmath>
|
12 |
+
#include <functional>
|
13 |
+
#include <random>
|
14 |
+
#include <vector>
|
15 |
+
|
16 |
+
#include <benchmark/benchmark.h>
|
17 |
+
#include <fp16/fp16.h>
|
18 |
+
#include "bench/gemm.h"
|
19 |
+
#include "bench/utils.h"
|
20 |
+
|
21 |
+
#include <xnnpack.h>
|
22 |
+
#include <xnnpack/aligned-allocator.h>
|
23 |
+
#include <xnnpack/common.h>
|
24 |
+
#include <xnnpack/gemm.h>
|
25 |
+
#include <xnnpack/math.h>
|
26 |
+
#include <xnnpack/pack.h>
|
27 |
+
#include <xnnpack/microfnptr.h>
|
28 |
+
#include <xnnpack/microparams-init.h>
|
29 |
+
|
30 |
+
|
31 |
+
// Micro-benchmark harness for a single FP16 minmax GEMM micro-kernel.
// Problem size (M, N, K) comes from the benchmark state ranges; mr/nr/kr/sr
// describe the kernel's tiling so inputs can be packed to match.
static void f16_gemm(benchmark::State& state,
  xnn_f16_gemm_minmax_ukernel_fn gemm,
  xnn_init_f16_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Bail out early (marking the benchmark skipped) when the host CPU lacks
  // the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // N and K rounded up to the micro-kernel tiling, so packed panels line up.
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device seed_source;
  auto rng = std::mt19937(seed_source());
  auto gen_f32 = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto gen_f16 = std::bind(fp16_ieee_from_fp32_value, gen_f32);

  // LHS matrix, filter, and bias filled with random FP16 (stored as uint16_t).
  std::vector<uint16_t> lhs(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(lhs.begin(), lhs.end(), std::ref(gen_f16));
  std::vector<uint16_t> filter(nc * kc);
  std::generate(filter.begin(), filter.end(), std::ref(gen_f16));
  std::vector<uint16_t> bias(nc);
  std::generate(bias.begin(), bias.end(), std::ref(gen_f16));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;  // weights + bias
  const size_t c_elements = mc * nc;
  // Enough rotating buffers that W and C are evicted between iterations.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(w_elements * num_buffers);
  std::fill(packed_weights.begin(), packed_weights.end(), 0);
  xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, filter.data(), bias.data(), packed_weights.data(), 0, nullptr);
  std::vector<uint16_t> output(c_elements * num_buffers);
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  // Clamping parameters spanning the whole FP16 range (i.e. no clamping).
  xnn_f16_minmax_params params;
  init_params(&params,
    UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  size_t active_buffer = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(lhs.data(), lhs.size() * sizeof(uint16_t));
    active_buffer = (active_buffer + 1) % num_buffers;
    state.ResumeTiming();

    // Walk the output in mr x nr tiles, invoking the micro-kernel per tile.
    for (uint32_t row = 0; row < mc; row += mr) {
      const uint32_t rows_this_tile = min(mc - row, mr);
      for (uint32_t col = 0; col < nc; col += nr) {
        const uint32_t cols_this_tile = min(nc - col, nr);
        gemm(
          rows_this_tile, cols_this_tile, kc * sizeof(uint16_t),
          lhs.data() + row * kc, kc * sizeof(uint16_t),
          packed_weights.data() + (nc_stride * active_buffer + col) * (kc_stride + 1),
          output.data() + (mc * active_buffer + row) * nc + col, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
          &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per element of the M x N x K product.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
|
110 |
+
|
111 |
+
|
112 |
+
#if XNN_PLATFORM_JIT
// Micro-benchmark harness for a JIT-generated FP16 minmax GEMM micro-kernel.
// Same measurement protocol as the static-kernel overload above, but the
// kernel is code-generated for the exact (mr, nc % nr, kc) problem first.
static void f16_gemm(benchmark::State& state,
  xnn_jit_gemm_code_generator_fn generator,
  xnn_init_f16_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  // Skip when the host CPU lacks the required ISA extension.
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // N and K rounded up to the micro-kernel tiling, so packed panels line up.
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // LHS matrix, filter, and bias filled with random FP16 (stored as uint16_t).
  std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;  // weights + bias
  const size_t c_elements = mc * nc;
  // Enough rotating buffers that W and C are evicted between iterations.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements));

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Prepare minmax parameters spanning the full FP16 range (no clamping).
  xnn_f16_minmax_params params;
  init_params(&params,
    UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  jit_gemm_params jit_params = {};
  jit_params.f16_minmax.min = UINT16_C(0xFC00);  /* -inf */
  jit_params.f16_minmax.max = UINT16_C(0x7C00);  /* inf */

  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  // Fixed: the generator must be specialized for the same K extent in bytes
  // that the kernel is invoked with below, i.e. kc * sizeof(uint16_t) for
  // half-precision elements. The previous kc * sizeof(float) was a
  // copy-paste from the f32 benchmark.
  generator(&code_buffer, mr, nc % nr, kc * sizeof(uint16_t), &jit_params);
  xnn_finalize_code_memory(&code_buffer);
  xnn_f16_gemm_minmax_ukernel_fn gemm = reinterpret_cast<xnn_f16_gemm_minmax_ukernel_fn>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Walk the output in mr x nr tiles, invoking the generated kernel per tile.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint16_t),
          a.data() + m * kc, kc * sizeof(uint16_t),
          w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
          &params);
      }
    }
  }

  // Release the JIT-allocated executable memory before reporting counters.
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per element of the M x N x K product.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
#endif  // XNN_PLATFORM_JIT
|
205 |
+
|
206 |
+
|
207 |
+
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Registration shims: each binds one hand-written AArch64 NEON-FP16 assembly
// micro-kernel, together with its tiling parameters, to the f16_gemm harness.
static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}

BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32)
BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32)
BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32)
BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
|
314 |
+
|
315 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
// Registration shims for the intrinsics-based NEON-FP16 micro-kernels.
static void f16_gemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
           xnn_init_f16_minmax_fp16arith_params, /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckNEONFP16ARITH);
}

BENCHMARK_GEMM(f16_gemm_1x8__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_4x8__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x8__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_8x8__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_1x16__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_4x16__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x16__neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_8x16__neonfp16arith_ld64)
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
382 |
+
|
383 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registration shims for the x86 AVX2 broadcast micro-kernels.
static void f16_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}
static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
  f16_gemm(state, xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
           xnn_init_f16_minmax_avx_params, /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
           benchmark::utils::CheckAVX2);
}

BENCHMARK_GEMM(f16_gemm_1x8__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_4x8__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_5x8__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_6x8__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_7x8__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_1x16__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_3x16__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_4x16__avx2_broadcast)
BENCHMARK_GEMM(f16_gemm_5x16__avx2_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
458 |
+
|
459 |
+
#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Registration shims for the JIT code generators. Each invokes the
// JIT overload of f16_gemm(), which generates the kernel before timing it.
static void f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}
static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
  f16_gemm(state,
    xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
    xnn_init_f16_minmax_fp16arith_params,
    /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
    benchmark::utils::CheckNEONFP16ARITH);
}

BENCHMARK_GEMM(f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
// Fixed: the closing-guard comment previously said XNN_ARCH_ARM, which does
// not match the opening #if condition (XNN_ARCH_ARM64).
#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
|
510 |
+
|
511 |
+
// Provide main() unless this translation unit is linked into a combined
// benchmark binary that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-igemm.cc
ADDED
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cfloat>
|
8 |
+
#include <cmath>
|
9 |
+
#include <functional>
|
10 |
+
#include <random>
|
11 |
+
#include <vector>
|
12 |
+
|
13 |
+
#include <benchmark/benchmark.h>
|
14 |
+
#include <fp16/fp16.h>
|
15 |
+
#include "bench/conv.h"
|
16 |
+
#include "bench/utils.h"
|
17 |
+
|
18 |
+
#include <xnnpack.h>
|
19 |
+
#include <xnnpack/aligned-allocator.h>
|
20 |
+
#include <xnnpack/common.h>
|
21 |
+
#include <xnnpack/igemm.h>
|
22 |
+
#include <xnnpack/indirection.h>
|
23 |
+
#include <xnnpack/microfnptr.h>
|
24 |
+
#include <xnnpack/microparams-init.h>
|
25 |
+
#include <xnnpack/operator.h>
|
26 |
+
#include <xnnpack/pack.h>
|
27 |
+
|
28 |
+
|
29 |
+
// Benchmarks one f16 indirect-GEMM (IGEMM) minmax microkernel on the
// convolution problem described by the benchmark state ranges:
//   0: input height    1: input width
//   2: kernel height   3: kernel width
//   4: padding height  5: padding width
//   6: subsampling     7: dilation
//   8: group input channels   9: group output channels
// Buffers are replicated (num_buffers) and rotated each iteration so that
// successive iterations do not benefit from a warm cache.
static void f16_igemm(benchmark::State& state,
  xnn_f16_igemm_minmax_ukernel_fn igemm,
  xnn_init_f16_minmax_params_fn init_params,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  // Convolution geometry, taken from the benchmark arguments.
  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  // Half-precision pseudo-random input generator.
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  // Tile-aligned problem dimensions.
  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  // Zero buffer used for out-of-bounds (padding) indirection entries.
  std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));

  const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  // Enough buffer replicas to overflow the last-level cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  // Pack weights + bias in the layout the microkernel expects.
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  xnn_pack_f16_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  // Build the indirection buffer via a minimally-initialized operator object.
  std::vector<const uint16_t*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  // Poison the output with NaN so stale results are detectable.
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Prepare minmax parameters (no clamping: [-inf, +inf]).
  xnn_f16_minmax_params params;
  init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the output over MRxNR microkernel calls.
    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      for (uint32_t n = 0; n < group_output_channels; n += nr) {
        const uint32_t nb = min(group_output_channels - n, nr);
        igemm(
          mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
          reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
          w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
          c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
          0, z.data(), &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per MAC.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
161 |
+
|
162 |
+
#if XNN_PLATFORM_JIT
|
163 |
+
// Benchmarks a JIT-generated f16 IGEMM minmax microkernel. Identical problem
// setup to the ukernel-pointer overload above; the microkernel is produced at
// runtime by `generator` into an executable code buffer.
//
// Fix: the return statuses of code-memory allocation, code generation, and
// finalization were previously ignored; on failure the benchmark would call
// through an invalid function pointer. Each step is now checked and the
// benchmark is skipped with an error instead.
static void f16_igemm(benchmark::State& state,
  xnn_jit_igemm_code_generator_fn generator,
  xnn_init_f16_minmax_params_fn init_params,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  // Convolution geometry, taken from the benchmark arguments.
  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

  // Zero buffer used for out-of-bounds (padding) indirection entries.
  std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));

  const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  // Enough buffer replicas to overflow the last-level cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  xnn_pack_f16_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  // Build the indirection buffer via a minimally-initialized operator object.
  std::vector<const uint16_t*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  // Poison the output with NaN so stale results are detectable.
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Prepare minmax parameters (no clamping: [-inf, +inf]).
  xnn_f16_minmax_params params;
  init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);

  jit_gemm_params jit_params = {};
  jit_params.f16_minmax.min = UINT16_C(0xFC00);  /* -inf */
  jit_params.f16_minmax.max = UINT16_C(0x7C00);  /* inf */

  // Generate the microkernel; skip the benchmark if any JIT step fails.
  xnn_code_buffer code_buffer;
  if (xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE) != xnn_status_success) {
    state.SkipWithError("failed to allocate code memory");
    return;
  }
  if (generator(&code_buffer,
                mr,
                group_output_channels % nr,
                group_input_channels * sizeof(uint16_t),
                kernel_size * mr * sizeof(void *),
                &jit_params) != xnn_status_success) {
    state.SkipWithError("failed to generate microkernel");
    xnn_release_code_memory(&code_buffer);
    return;
  }
  if (xnn_finalize_code_memory(&code_buffer) != xnn_status_success) {
    state.SkipWithError("failed to finalize code memory");
    xnn_release_code_memory(&code_buffer);
    return;
  }
  auto igemm = reinterpret_cast<xnn_f16_igemm_minmax_ukernel_fn>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the output over MRxNR microkernel calls.
    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      for (uint32_t n = 0; n < group_output_channels; n += nr) {
        const uint32_t nb = min(group_output_channels - n, nr);
        igemm(
          mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
          reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
          w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
          c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
          0, z.data(), &params);
      }
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 FLOPs (multiply + add) per MAC.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
|
311 |
+
#endif // XNN_PLATFORM_JIT
|
312 |
+
|
313 |
+
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
|
314 |
+
static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
|
315 |
+
f16_igemm(state,
|
316 |
+
xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
|
317 |
+
xnn_init_f16_minmax_fp16arith_params,
|
318 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
319 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
320 |
+
}
|
321 |
+
static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
|
322 |
+
f16_igemm(state,
|
323 |
+
xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
|
324 |
+
xnn_init_f16_minmax_fp16arith_params,
|
325 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
326 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
327 |
+
}
|
328 |
+
static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
|
329 |
+
f16_igemm(state,
|
330 |
+
xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
|
331 |
+
xnn_init_f16_minmax_fp16arith_params,
|
332 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
333 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
334 |
+
}
|
335 |
+
static void f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
336 |
+
f16_igemm(state,
|
337 |
+
xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
|
338 |
+
xnn_init_f16_minmax_fp16arith_params,
|
339 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
340 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
341 |
+
}
|
342 |
+
static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
|
343 |
+
f16_igemm(state,
|
344 |
+
xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
|
345 |
+
xnn_init_f16_minmax_fp16arith_params,
|
346 |
+
/*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
347 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
348 |
+
}
|
349 |
+
static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
350 |
+
f16_igemm(state,
|
351 |
+
xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
|
352 |
+
xnn_init_f16_minmax_fp16arith_params,
|
353 |
+
/*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
354 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
355 |
+
}
|
356 |
+
static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
|
357 |
+
f16_igemm(state,
|
358 |
+
xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
|
359 |
+
xnn_init_f16_minmax_fp16arith_params,
|
360 |
+
/*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
361 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
362 |
+
}
|
363 |
+
static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
364 |
+
f16_igemm(state,
|
365 |
+
xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
|
366 |
+
xnn_init_f16_minmax_fp16arith_params,
|
367 |
+
/*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
368 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
369 |
+
}
|
370 |
+
|
371 |
+
BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
|
372 |
+
BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
|
373 |
+
BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
|
374 |
+
BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64)
|
375 |
+
BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32)
|
376 |
+
BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64)
|
377 |
+
BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32)
|
378 |
+
BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64)
|
379 |
+
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
|
380 |
+
|
381 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
382 |
+
static void f16_igemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
383 |
+
f16_igemm(state,
|
384 |
+
xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
|
385 |
+
xnn_init_f16_minmax_fp16arith_params,
|
386 |
+
/*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
387 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
388 |
+
}
|
389 |
+
static void f16_igemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
390 |
+
f16_igemm(state,
|
391 |
+
xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
|
392 |
+
xnn_init_f16_minmax_fp16arith_params,
|
393 |
+
/*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
394 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
395 |
+
}
|
396 |
+
static void f16_igemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
397 |
+
f16_igemm(state,
|
398 |
+
xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
|
399 |
+
xnn_init_f16_minmax_fp16arith_params,
|
400 |
+
/*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
401 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
402 |
+
}
|
403 |
+
static void f16_igemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
404 |
+
f16_igemm(state,
|
405 |
+
xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
|
406 |
+
xnn_init_f16_minmax_fp16arith_params,
|
407 |
+
/*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
408 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
409 |
+
}
|
410 |
+
static void f16_igemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
411 |
+
f16_igemm(state,
|
412 |
+
xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
|
413 |
+
xnn_init_f16_minmax_fp16arith_params,
|
414 |
+
/*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
415 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
416 |
+
}
|
417 |
+
static void f16_igemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
418 |
+
f16_igemm(state,
|
419 |
+
xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
|
420 |
+
xnn_init_f16_minmax_fp16arith_params,
|
421 |
+
/*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
422 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
423 |
+
}
|
424 |
+
static void f16_igemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
425 |
+
f16_igemm(state,
|
426 |
+
xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
|
427 |
+
xnn_init_f16_minmax_fp16arith_params,
|
428 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
429 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
430 |
+
}
|
431 |
+
static void f16_igemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
432 |
+
f16_igemm(state,
|
433 |
+
xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
|
434 |
+
xnn_init_f16_minmax_fp16arith_params,
|
435 |
+
/*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
436 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
437 |
+
}
|
438 |
+
|
439 |
+
BENCHMARK_CONV(f16_igemm_1x8__neonfp16arith_ld64)
|
440 |
+
BENCHMARK_CONV(f16_igemm_4x8__neonfp16arith_ld64)
|
441 |
+
BENCHMARK_CONV(f16_igemm_6x8__neonfp16arith_ld64)
|
442 |
+
BENCHMARK_CONV(f16_igemm_8x8__neonfp16arith_ld64)
|
443 |
+
BENCHMARK_CONV(f16_igemm_1x16__neonfp16arith_ld64)
|
444 |
+
BENCHMARK_CONV(f16_igemm_4x16__neonfp16arith_ld64)
|
445 |
+
BENCHMARK_CONV(f16_igemm_6x16__neonfp16arith_ld64)
|
446 |
+
BENCHMARK_CONV(f16_igemm_8x16__neonfp16arith_ld64)
|
447 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
448 |
+
|
449 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
450 |
+
static void f16_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
451 |
+
f16_igemm(state,
|
452 |
+
xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
|
453 |
+
xnn_init_f16_minmax_avx_params,
|
454 |
+
/*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
455 |
+
benchmark::utils::CheckAVX2);
|
456 |
+
}
|
457 |
+
static void f16_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
458 |
+
f16_igemm(state,
|
459 |
+
xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
|
460 |
+
xnn_init_f16_minmax_avx_params,
|
461 |
+
/*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
462 |
+
benchmark::utils::CheckAVX2);
|
463 |
+
}
|
464 |
+
static void f16_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
465 |
+
f16_igemm(state,
|
466 |
+
xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
|
467 |
+
xnn_init_f16_minmax_avx_params,
|
468 |
+
/*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
469 |
+
benchmark::utils::CheckAVX2);
|
470 |
+
}
|
471 |
+
static void f16_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
472 |
+
f16_igemm(state,
|
473 |
+
xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
|
474 |
+
xnn_init_f16_minmax_avx_params,
|
475 |
+
/*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
476 |
+
benchmark::utils::CheckAVX2);
|
477 |
+
}
|
478 |
+
static void f16_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
|
479 |
+
f16_igemm(state,
|
480 |
+
xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
|
481 |
+
xnn_init_f16_minmax_avx_params,
|
482 |
+
/*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
|
483 |
+
benchmark::utils::CheckAVX2);
|
484 |
+
}
|
485 |
+
static void f16_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
486 |
+
f16_igemm(state,
|
487 |
+
xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
|
488 |
+
xnn_init_f16_minmax_avx_params,
|
489 |
+
/*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
490 |
+
benchmark::utils::CheckAVX2);
|
491 |
+
}
|
492 |
+
static void f16_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
493 |
+
f16_igemm(state,
|
494 |
+
xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
|
495 |
+
xnn_init_f16_minmax_avx_params,
|
496 |
+
/*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
497 |
+
benchmark::utils::CheckAVX2);
|
498 |
+
}
|
499 |
+
static void f16_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
500 |
+
f16_igemm(state,
|
501 |
+
xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
|
502 |
+
xnn_init_f16_minmax_avx_params,
|
503 |
+
/*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
504 |
+
benchmark::utils::CheckAVX2);
|
505 |
+
}
|
506 |
+
static void f16_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
|
507 |
+
f16_igemm(state,
|
508 |
+
xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
|
509 |
+
xnn_init_f16_minmax_avx_params,
|
510 |
+
/*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
511 |
+
benchmark::utils::CheckAVX2);
|
512 |
+
}
|
513 |
+
|
514 |
+
BENCHMARK_CONV(f16_igemm_1x8__avx2_broadcast)
|
515 |
+
BENCHMARK_CONV(f16_igemm_4x8__avx2_broadcast)
|
516 |
+
BENCHMARK_CONV(f16_igemm_5x8__avx2_broadcast)
|
517 |
+
BENCHMARK_CONV(f16_igemm_6x8__avx2_broadcast)
|
518 |
+
BENCHMARK_CONV(f16_igemm_7x8__avx2_broadcast)
|
519 |
+
BENCHMARK_CONV(f16_igemm_1x16__avx2_broadcast)
|
520 |
+
BENCHMARK_CONV(f16_igemm_3x16__avx2_broadcast)
|
521 |
+
BENCHMARK_CONV(f16_igemm_4x16__avx2_broadcast)
|
522 |
+
BENCHMARK_CONV(f16_igemm_5x16__avx2_broadcast)
|
523 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
524 |
+
|
525 |
+
#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
|
526 |
+
static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
|
527 |
+
f16_igemm(state,
|
528 |
+
xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
|
529 |
+
xnn_init_f16_minmax_fp16arith_params,
|
530 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
531 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
532 |
+
}
|
533 |
+
static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
|
534 |
+
f16_igemm(state,
|
535 |
+
xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
|
536 |
+
xnn_init_f16_minmax_fp16arith_params,
|
537 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
538 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
539 |
+
}
|
540 |
+
static void f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
|
541 |
+
f16_igemm(state,
|
542 |
+
xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
|
543 |
+
xnn_init_f16_minmax_fp16arith_params,
|
544 |
+
/*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
545 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
546 |
+
}
|
547 |
+
static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
|
548 |
+
f16_igemm(state,
|
549 |
+
xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
|
550 |
+
xnn_init_f16_minmax_fp16arith_params,
|
551 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
552 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
553 |
+
}
|
554 |
+
static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
555 |
+
f16_igemm(state,
|
556 |
+
xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
|
557 |
+
xnn_init_f16_minmax_fp16arith_params,
|
558 |
+
/*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
559 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
560 |
+
}
|
561 |
+
static void f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
562 |
+
f16_igemm(state,
|
563 |
+
xnn_generate_f16_igemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
|
564 |
+
xnn_init_f16_minmax_fp16arith_params,
|
565 |
+
/*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
566 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
567 |
+
}
|
568 |
+
static void f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
|
569 |
+
f16_igemm(state,
|
570 |
+
xnn_generate_f16_igemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
|
571 |
+
xnn_init_f16_minmax_fp16arith_params,
|
572 |
+
/*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
|
573 |
+
benchmark::utils::CheckNEONFP16ARITH);
|
574 |
+
}
|
575 |
+
|
576 |
+
BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
|
577 |
+
BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
|
578 |
+
BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
|
579 |
+
BENCHMARK_CONV(f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0)
|
580 |
+
BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
|
581 |
+
BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
|
582 |
+
BENCHMARK_CONV(f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
|
583 |
+
BENCHMARK_CONV(f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
|
584 |
+
#endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
|
585 |
+
|
586 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
587 |
+
BENCHMARK_MAIN();
|
588 |
+
#endif
|
bench/f16-raddstoreexpminusmax.cc
ADDED
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/raddstoreexpminusmax.h>
|
22 |
+
#include <xnnpack/rmax.h>
|
23 |
+
|
24 |
+
|
25 |
+
// Benchmarks an f16 raddstoreexpminusmax microkernel: computes exp(x - max),
// stores the result, and accumulates the sum (the softmax inner loop).
// The `rmax` microkernel supplies the input maximum outside the timed region.
static void f16_raddstoreexpminusmax(
  benchmark::State& state,
  xnn_f16_rmax_ukernel_fn rmax,
  xnn_f16_raddstoreexpminusmax_ukernel_fn raddstoreexpminusmax,
  xnn_init_f16_expminus_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t elements = state.range(0);
  // Round the output up to a whole number of (maximum-size) cache lines.
  const size_t cache_line_size_max = 128;
  const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(uint16_t));

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-100.0f, 100.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Rotate through enough output buffers to overflow the last-level cache.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(uint16_t));
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(packed_elements * num_buffers);

  std::generate(x.begin(), x.end(), std::ref(f16rng));

  benchmark::utils::DisableDenormals();

  xnn_f16_expminus_params params;
  init_params(&params);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    // Compute the maximum (untimed); seed with NaN so a no-op rmax is visible.
    uint16_t x_max = UINT16_C(0x7E00) /* NaN */;
    rmax(elements * sizeof(uint16_t), x.data(), &x_max);
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }
    state.ResumeTiming();

    uint16_t y_sum = UINT16_C(0x7E00) /* NaN */;
    raddstoreexpminusmax(elements * sizeof(uint16_t), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const size_t elements_per_iteration = elements;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  // One read + one write per element.
  const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
84 |
+
|
85 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
// NEONFP16ARITH microkernel variants. Naming: rr2_p2 is the range-reduction/
// polynomial scheme, xN is the number of elements handled per loop iteration,
// accM is the number of parallel accumulators.
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc4,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc4,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc5,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc5,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc3,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc3,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc4,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc4,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72_acc3,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72_acc3,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc5,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc5,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc2,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc3,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc6,
                  xnn_f16_rmax_ukernel__neonfp16arith,
                  xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6,
                  xnn_init_f16_expminus_fp16arith_rr2_p2_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
#endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// AVX2 microkernel variants; the max reduction uses the F16C rmax kernel.
// Naming: rr1_p2 is the range-reduction/polynomial scheme, xN elements per
// loop iteration, accM parallel accumulators.
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc4,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc4,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc5,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc5,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc3,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc3,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc4,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc4,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72_acc3,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72_acc3,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc5,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc5,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc2,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc3,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc6,
                  xnn_f16_rmax_ukernel__f16c,
                  xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6,
                  xnn_init_f16_expminus_avx2_rr1_p2_params,
                  benchmark::utils::CheckAVX2)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
  ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

// Provide main() unless this TU is linked into a combined benchmark binary.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-rsum.cc
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/reduce.h>
|
22 |
+
|
23 |
+
|
24 |
+
// Measures an f16 horizontal-sum (rsum) microkernel over a random input
// vector, reporting elements/s and bytes/s throughput.
static void f16_rsum(
  benchmark::State& state,
  xnn_f16_rsum_ukernel_fn rsum,
  xnn_init_f16_scale_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t elements = state.range(0);

  std::random_device seed_source;
  auto rng = std::mt19937(seed_source());
  std::uniform_real_distribution<float> value_dist(-1.0f, 1.0f);

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
  std::generate(input.begin(), input.end(),
                [&]() { return fp16_ieee_from_fp32_value(value_dist(rng)); });

  // The kernel scales the accumulated sum by 0.1 (supplied in half precision).
  xnn_f16_scale_params params;
  init_params(&params, /*scale=*/fp16_ieee_from_fp32_value(0.1f));

  // Start from NaN so an untouched output is detectable.
  uint16_t output = UINT16_C(0x7E00); /* NaN */
  for (auto _ : state) {
    rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const size_t elements_per_iteration = elements;
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  // The reduction only reads; one 2-byte load per element.
  const size_t bytes_per_iteration = elements * sizeof(uint16_t);
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}
|
65 |
+
|
66 |
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
// NEONFP16ARITH rsum variants. Naming: xN = elements per loop iteration,
// accM = number of parallel accumulators.
BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x8,
                  xnn_f16_rsum_ukernel__neonfp16arith_x8,
                  xnn_init_f16_scale_fp16arith_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x16_acc2,
                  xnn_f16_rsum_ukernel__neonfp16arith_x16_acc2,
                  xnn_init_f16_scale_fp16arith_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x24_acc3,
                  xnn_f16_rsum_ukernel__neonfp16arith_x24_acc3,
                  xnn_init_f16_scale_fp16arith_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc2,
                  xnn_f16_rsum_ukernel__neonfp16arith_x32_acc2,
                  xnn_init_f16_scale_fp16arith_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
  ->UseRealTime();
BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc4,
                  xnn_f16_rsum_ukernel__neonfp16arith_x32_acc4,
                  xnn_init_f16_scale_fp16arith_params,
                  benchmark::utils::CheckNEONFP16ARITH)
  ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
  ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

// Provide main() unless this TU is linked into a combined benchmark binary.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
|
bench/f16-spmm.cc
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2019 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cassert>
|
8 |
+
#include <cmath>
|
9 |
+
#include <cstddef>
|
10 |
+
#include <cstdlib>
|
11 |
+
#include <random>
|
12 |
+
#include <vector>
|
13 |
+
|
14 |
+
#include <benchmark/benchmark.h>
|
15 |
+
#include <fp16/fp16.h>
|
16 |
+
#include "bench/spmm.h"
|
17 |
+
#include "bench/utils.h"
|
18 |
+
|
19 |
+
#include <xnnpack.h>
|
20 |
+
#include <xnnpack/aligned-allocator.h>
|
21 |
+
#include <xnnpack/common.h>
|
22 |
+
#include <xnnpack/microfnptr.h>
|
23 |
+
#include <xnnpack/microparams-init.h>
|
24 |
+
#include <xnnpack/spmm.h>
|
25 |
+
|
26 |
+
// Returns true for an IEEE fp16 bit pattern that encodes zero of either sign:
// masking off the sign bit leaves 0 for both +0 (0x0000) and -0 (0x8000).
static inline bool is_fp16_zero(uint16_t x) {
  return (x & UINT16_C(0x7FFF)) == 0;
}
|
30 |
+
|
31 |
+
// Benchmarks an f16 sparse-matrix / dense-matrix multiplication (SpMM)
// microkernel. A random weight matrix is sparsified to the requested
// `sparsity`, packed into the (weights, delta-map, nonzero-count)
// representation the microkernel consumes, and the kernel is then timed.
// "FLOPS" counts only non-zero multiply-adds; "EffFLOPS" counts the
// dense-equivalent work. `mr` is accepted for signature parity with the
// benchmark wrappers; the packing here depends only on `nr`.
static void f16_spmm(benchmark::State& state,
  xnn_f16_spmm_minmax_ukernel_fn spmm, uint32_t mr, uint32_t nr, float sparsity,
  xnn_init_f16_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  std::uniform_real_distribution<float> f32dist;
  std::uniform_real_distribution<float> pdist;

  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(kc * mc);
  // Think of b as (n/nr + n % nr) x k, expansion happens later.
  const size_t ncols = nc / nr + nc % nr;
  std::vector<uint16_t> b(ncols * kc);
  std::vector<uint16_t> bias(nc);
  // Number of non-zero weights per N (output channel).
  std::vector<uint32_t> nmap(nc);
  // Mapping from index of non-zero weight to increment of K (input channel) following this index.
  std::vector<int32_t> dmap(nc * kc);
  std::vector<uint16_t> w(nc * kc + nc);
  std::vector<uint16_t> output(nc * mc);

  std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
  std::generate(b.begin(), b.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
  std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
  // NaN-fill the output so stale data cannot masquerade as results.
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
  std::fill(nmap.begin(), nmap.end(), 0);
  std::fill(dmap.begin(), dmap.end(), 0);
  std::fill(w.begin(), w.end(), 0);

  // Knock out weights at random until the requested sparsity is reached.
  for (uint16_t& b_value : b) {
    if (pdist(rng) <= sparsity) {
      b_value = 0;
    }
  }

  uint32_t nnz = 0;
  uint32_t wcnt = 0;
  size_t last_kk = 0;
  bool first_nzz = true;
  size_t first_kk = 0;
  for (size_t nn = 0; nn < nc / nr; nn++) {
    for (size_t i = 0; i < nr; ++i)
      w[wcnt++] = bias[nr * nn + i];
    for (size_t kk = 0; kk < kc; kk++) {
      if (!is_fp16_zero(b[nn * kc + kk])) {
        // Every non-zero actually corresponds to nr adjacent non-zeros.
        for (size_t i = 0; i < nr; ++i)
          w[wcnt++] = fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
        // Skip the very first non-zero weight as we record only the difference.
        if (first_nzz) {
          first_kk = kk;
        } else {
          const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
          dmap[nnz++] = increment;
        }
        last_kk = kk;
        first_nzz = false;
        nmap[nn] += 1;
      }
    }
  }

  // now we've constructed the matrix for the blocked part and switch to the
  // leftovers, which we do as nr=1 always.
  for (size_t nn = nc / nr; nn < ncols; nn++) {
    w[wcnt++] = bias[(nc / nr) * nr + (nn - nc / nr)];
    for (size_t kk = 0; kk < kc; kk++) {
      if (!is_fp16_zero(b[nn * kc + kk])) {
        // Leftover columns are packed one at a time (nr == 1 behavior).
        w[wcnt++] = b[nn * kc + kk];
        // Skip the very first non-zero weight as we record only the difference.
        if (first_nzz) {
          first_kk = kk;
        } else {
          const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
          dmap[nnz++] = increment;
        }
        last_kk = kk;
        first_nzz = false;
        nmap[nn] += 1;
      }
    }
  }
  // In the end, we must return input pointer to the initial value.
  // (int32_t here for consistency with dmap's element type; the product is
  // already computed in int32_t.)
  const int32_t increment = int32_t(first_kk - last_kk) * int32_t(mc * sizeof(uint16_t));
  dmap[nnz++] = increment;

  // Generate expanded b which will be used in reference calculation.
  // Everywhere there is input non-zero in the original we copy it and add an
  // adjacent non-zero with incremented weight value.
  std::vector<uint16_t> b_full(nc * kc);
  if (nr == 1) {
    b_full = b;
  }
  else {
    for (size_t nn = 0; nn < nc / nr; nn++) {
      for (size_t kk = 0; kk < kc; kk++) {
        // Fix: use the same fp16 zero test as the packing loops above.
        // The previous `b[...] != 0.0f` implicitly converted the raw uint16_t
        // bit pattern to float (i.e. tested bits != 0), which would treat
        // fp16 -0 (0x8000) as non-zero and desynchronize the reference
        // matrix from the packed representation.
        if (!is_fp16_zero(b[nn * kc + kk])) {
          for (size_t i = 0; i < nr; ++i)
            b_full[nr * nn * kc + i * kc + kk] = fp16_ieee_from_fp32_value(
              fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
        }
      }
    }
    for (size_t nn = nc / nr; nn < ncols; nn++) {
      for (size_t kk = 0; kk < kc; kk++) {
        if (!is_fp16_zero(b[nn * kc + kk])) {
          b_full[nr * (nc / nr) * kc + (nn - nc / nr) * kc + kk] = b[nn * kc + kk];
        }
      }
    }
  }

  // Micro-kernel can access one element beyond w and dmap for software pipelining.
  w.resize(wcnt + 1);
  dmap.resize(nnz + 1);

  // Prepare parameters: clamp to the full fp16 range (-inf, +inf bit patterns).
  xnn_f16_minmax_params params;
  init_params(&params, 0xFC00 /* -inf */, 0x7C00 /* inf */);

  for (auto _ : state) {
    spmm(mc * sizeof(uint16_t), nc,
      input.data() + first_kk * mc,
      w.data(), dmap.data(), nmap.data(),
      output.data(), mc * sizeof(uint16_t),
      &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nnz, benchmark::Counter::kIsRate);

  state.counters["EffFLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
|
180 |
+
|
181 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
182 |
+
// Registration wrappers: each one instantiates f16_spmm with a specific
// NEONFP16ARITH SpMM microkernel tile (MRxNR) at sparsity 0.8.
// The `net` parameter is unused in the body — presumably required by the
// benchmark-registration macro's callback signature; confirm against the
// registration site (outside this chunk).
static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_8x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_pipelined, 8, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_8x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_x2, 8, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_16x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_pipelined, 16, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_16x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_x2, 16, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_24x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_pipelined, 24, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_24x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_x2, 24, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
  f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f,
    xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
}
|
222 |
+
static void spmm80_32x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
|
223 |
+
f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_pipelined, 32, 1, 0.8f,
|
224 |
+
xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
|
225 |
+
}
|
226 |
+
static void spmm80_32x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
|
227 |
+
f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_x2, 32, 1, 0.8f,
|
228 |
+
xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
|
229 |
+
}
|
230 |
+
|
231 |
+
BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_pipelined)
|
232 |
+
BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_pipelined)
|
233 |
+
BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_pipelined)
|
234 |
+
BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_pipelined)
|
235 |
+
BENCHMARK_SPMM(spmm80_8x1__neonfp16arith)
|
236 |
+
BENCHMARK_SPMM(spmm80_16x1__neonfp16arith)
|
237 |
+
BENCHMARK_SPMM(spmm80_24x1__neonfp16arith)
|
238 |
+
BENCHMARK_SPMM(spmm80_32x1__neonfp16arith)
|
239 |
+
BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_x2)
|
240 |
+
BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_x2)
|
241 |
+
BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_x2)
|
242 |
+
BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_x2)
|
243 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
244 |
+
|
245 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
246 |
+
BENCHMARK_MAIN();
|
247 |
+
#endif
|
bench/f16-velu.cc
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vunary.h>
|
22 |
+
|
23 |
+
|
24 |
+
static void f16_velu(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_f16_velu_ukernel_fn elu,
|
27 |
+
xnn_init_f16_elu_params_fn init_params,
|
28 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
29 |
+
{
|
30 |
+
if (isa_check && !isa_check(state)) {
|
31 |
+
return;
|
32 |
+
}
|
33 |
+
|
34 |
+
const size_t num_elements = state.range(0);
|
35 |
+
|
36 |
+
std::random_device random_device;
|
37 |
+
auto rng = std::mt19937(random_device());
|
38 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(-9.0f, 9.0f), std::ref(rng));
|
39 |
+
auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
|
40 |
+
|
41 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
|
42 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
|
43 |
+
std::generate(x.begin(), x.end(), std::ref(f16rng));
|
44 |
+
std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
|
45 |
+
|
46 |
+
union xnn_f16_elu_params params;
|
47 |
+
init_params(¶ms,
|
48 |
+
UINT16_C(0x3C00) /* prescale = 1.0h */,
|
49 |
+
UINT16_C(0x3C00) /* alpha = 1.0h */,
|
50 |
+
UINT16_C(0x3C00) /* beta = 1.0h */);
|
51 |
+
for (auto _ : state) {
|
52 |
+
elu(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms);
|
53 |
+
}
|
54 |
+
|
55 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
56 |
+
if (cpu_frequency != 0) {
|
57 |
+
state.counters["cpufreq"] = cpu_frequency;
|
58 |
+
}
|
59 |
+
|
60 |
+
const size_t elements_per_iteration = num_elements;
|
61 |
+
state.counters["elements"] =
|
62 |
+
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
|
63 |
+
|
64 |
+
const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
|
65 |
+
state.counters["bytes"] =
|
66 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
67 |
+
}
|
68 |
+
|
69 |
+
|
70 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
71 |
+
BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x8,
|
72 |
+
xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x8,
|
73 |
+
xnn_init_f16_elu_fp16arith_rr1_p3_params,
|
74 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
75 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
76 |
+
->UseRealTime();
|
77 |
+
BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x16,
|
78 |
+
xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
|
79 |
+
xnn_init_f16_elu_fp16arith_rr1_p3_params,
|
80 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
81 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
82 |
+
->UseRealTime();
|
83 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
84 |
+
|
85 |
+
|
86 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
87 |
+
BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x8,
|
88 |
+
xnn_f16_velu_ukernel__avx2_rr1_p3_x8,
|
89 |
+
xnn_init_f16_elu_avx2_rr1_p3_params,
|
90 |
+
benchmark::utils::CheckAVX2)
|
91 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
92 |
+
->UseRealTime();
|
93 |
+
BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x16,
|
94 |
+
xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
|
95 |
+
xnn_init_f16_elu_avx2_rr1_p3_params,
|
96 |
+
benchmark::utils::CheckAVX2)
|
97 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
98 |
+
->UseRealTime();
|
99 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
100 |
+
|
101 |
+
|
102 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
103 |
+
BENCHMARK_MAIN();
|
104 |
+
#endif
|
bench/f16-vsigmoid.cc
ADDED
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vunary.h>
|
22 |
+
|
23 |
+
|
24 |
+
static void f16_vsigmoid(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_f16_vsigmoid_ukernel_fn sigmoid,
|
27 |
+
xnn_init_f16_sigmoid_params_fn init_params,
|
28 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
29 |
+
{
|
30 |
+
if (isa_check && !isa_check(state)) {
|
31 |
+
return;
|
32 |
+
}
|
33 |
+
|
34 |
+
const size_t num_elements = state.range(0);
|
35 |
+
|
36 |
+
std::random_device random_device;
|
37 |
+
auto rng = std::mt19937(random_device());
|
38 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
|
39 |
+
auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
|
40 |
+
|
41 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
|
42 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
|
43 |
+
std::generate(x.begin(), x.end(), std::ref(f16rng));
|
44 |
+
std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
|
45 |
+
|
46 |
+
xnn_f16_sigmoid_params params;
|
47 |
+
init_params(¶ms);
|
48 |
+
for (auto _ : state) {
|
49 |
+
sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms);
|
50 |
+
}
|
51 |
+
|
52 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
53 |
+
if (cpu_frequency != 0) {
|
54 |
+
state.counters["cpufreq"] = cpu_frequency;
|
55 |
+
}
|
56 |
+
|
57 |
+
const size_t elements_per_iteration = num_elements;
|
58 |
+
state.counters["elements"] =
|
59 |
+
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
|
60 |
+
|
61 |
+
const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
|
62 |
+
state.counters["bytes"] =
|
63 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
64 |
+
}
|
65 |
+
|
66 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
67 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x8,
|
68 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x8,
|
69 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
70 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
71 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
72 |
+
->UseRealTime();
|
73 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x16,
|
74 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x16,
|
75 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
76 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
77 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
78 |
+
->UseRealTime();
|
79 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x24,
|
80 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x24,
|
81 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
82 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
83 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
84 |
+
->UseRealTime();
|
85 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x32,
|
86 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x32,
|
87 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
88 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
89 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
90 |
+
->UseRealTime();
|
91 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x40,
|
92 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x40,
|
93 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
94 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
95 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
96 |
+
->UseRealTime();
|
97 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x48,
|
98 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x48,
|
99 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
100 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
101 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
102 |
+
->UseRealTime();
|
103 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x56,
|
104 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x56,
|
105 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
106 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
107 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
108 |
+
->UseRealTime();
|
109 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x64,
|
110 |
+
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x64,
|
111 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
112 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
113 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
114 |
+
->UseRealTime();
|
115 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
116 |
+
|
117 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
118 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x8,
|
119 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x8,
|
120 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
121 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
122 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
123 |
+
->UseRealTime();
|
124 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x16,
|
125 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x16,
|
126 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
127 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
128 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
129 |
+
->UseRealTime();
|
130 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x24,
|
131 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x24,
|
132 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
133 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
134 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
135 |
+
->UseRealTime();
|
136 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x32,
|
137 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x32,
|
138 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
139 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
140 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
141 |
+
->UseRealTime();
|
142 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x40,
|
143 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
|
144 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
145 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
146 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
147 |
+
->UseRealTime();
|
148 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x48,
|
149 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x48,
|
150 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
151 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
152 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
153 |
+
->UseRealTime();
|
154 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x56,
|
155 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x56,
|
156 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
157 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
158 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
159 |
+
->UseRealTime();
|
160 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x64,
|
161 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x64,
|
162 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
163 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
164 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
165 |
+
->UseRealTime();
|
166 |
+
|
167 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x8,
|
168 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x8,
|
169 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
170 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
171 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
172 |
+
->UseRealTime();
|
173 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x16,
|
174 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x16,
|
175 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
176 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
177 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
178 |
+
->UseRealTime();
|
179 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x24,
|
180 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x24,
|
181 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
182 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
183 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
184 |
+
->UseRealTime();
|
185 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x32,
|
186 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x32,
|
187 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
188 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
189 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
190 |
+
->UseRealTime();
|
191 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x40,
|
192 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x40,
|
193 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
194 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
195 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
196 |
+
->UseRealTime();
|
197 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x48,
|
198 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x48,
|
199 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
200 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
201 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
202 |
+
->UseRealTime();
|
203 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x56,
|
204 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x56,
|
205 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
206 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
207 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
208 |
+
->UseRealTime();
|
209 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x64,
|
210 |
+
xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x64,
|
211 |
+
xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
|
212 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
213 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
214 |
+
->UseRealTime();
|
215 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
216 |
+
|
217 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
218 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x8,
|
219 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x8,
|
220 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
221 |
+
benchmark::utils::CheckAVX2)
|
222 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
223 |
+
->UseRealTime();
|
224 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x16,
|
225 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x16,
|
226 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
227 |
+
benchmark::utils::CheckAVX2)
|
228 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
229 |
+
->UseRealTime();
|
230 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x24,
|
231 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x24,
|
232 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
233 |
+
benchmark::utils::CheckAVX2)
|
234 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
235 |
+
->UseRealTime();
|
236 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x32,
|
237 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x32,
|
238 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
239 |
+
benchmark::utils::CheckAVX2)
|
240 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
241 |
+
->UseRealTime();
|
242 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x40,
|
243 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x40,
|
244 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
245 |
+
benchmark::utils::CheckAVX2)
|
246 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
247 |
+
->UseRealTime();
|
248 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x48,
|
249 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x48,
|
250 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
251 |
+
benchmark::utils::CheckAVX2)
|
252 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
253 |
+
->UseRealTime();
|
254 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x56,
|
255 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x56,
|
256 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
257 |
+
benchmark::utils::CheckAVX2)
|
258 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
259 |
+
->UseRealTime();
|
260 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x64,
|
261 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x64,
|
262 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
263 |
+
benchmark::utils::CheckAVX2)
|
264 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
265 |
+
->UseRealTime();
|
266 |
+
|
267 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x8,
|
268 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x8,
|
269 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
270 |
+
benchmark::utils::CheckAVX2)
|
271 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
272 |
+
->UseRealTime();
|
273 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x16,
|
274 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x16,
|
275 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
276 |
+
benchmark::utils::CheckAVX2)
|
277 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
278 |
+
->UseRealTime();
|
279 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x24,
|
280 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x24,
|
281 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
282 |
+
benchmark::utils::CheckAVX2)
|
283 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
284 |
+
->UseRealTime();
|
285 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x32,
|
286 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
|
287 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
288 |
+
benchmark::utils::CheckAVX2)
|
289 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
290 |
+
->UseRealTime();
|
291 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x40,
|
292 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x40,
|
293 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
294 |
+
benchmark::utils::CheckAVX2)
|
295 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
296 |
+
->UseRealTime();
|
297 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x48,
|
298 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x48,
|
299 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
300 |
+
benchmark::utils::CheckAVX2)
|
301 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
302 |
+
->UseRealTime();
|
303 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x56,
|
304 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x56,
|
305 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
306 |
+
benchmark::utils::CheckAVX2)
|
307 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
308 |
+
->UseRealTime();
|
309 |
+
BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x64,
|
310 |
+
xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x64,
|
311 |
+
xnn_init_f16_sigmoid_avx2_rr1_p2_params,
|
312 |
+
benchmark::utils::CheckAVX2)
|
313 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
314 |
+
->UseRealTime();
|
315 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
316 |
+
|
317 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
318 |
+
BENCHMARK_MAIN();
|
319 |
+
#endif
|
bench/f16-vsqrt.cc
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2022 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vunary.h>
|
22 |
+
|
23 |
+
|
24 |
+
static void f16_vsqrt(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_f16_vsqrt_ukernel_fn sqrt,
|
27 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
28 |
+
{
|
29 |
+
if (isa_check && !isa_check(state)) {
|
30 |
+
return;
|
31 |
+
}
|
32 |
+
|
33 |
+
const size_t num_elements = state.range(0);
|
34 |
+
|
35 |
+
std::random_device random_device;
|
36 |
+
auto rng = std::mt19937(random_device());
|
37 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
|
38 |
+
auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
|
39 |
+
|
40 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
|
41 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
|
42 |
+
std::generate(x.begin(), x.end(), std::ref(f16rng));
|
43 |
+
std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
|
44 |
+
|
45 |
+
for (auto _ : state) {
|
46 |
+
sqrt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
|
47 |
+
}
|
48 |
+
|
49 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
50 |
+
if (cpu_frequency != 0) {
|
51 |
+
state.counters["cpufreq"] = cpu_frequency;
|
52 |
+
}
|
53 |
+
|
54 |
+
const size_t elements_per_iteration = num_elements;
|
55 |
+
state.counters["elements"] =
|
56 |
+
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
|
57 |
+
|
58 |
+
const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
|
59 |
+
state.counters["bytes"] =
|
60 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
61 |
+
}
|
62 |
+
|
63 |
+
|
64 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
65 |
+
BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x8,
|
66 |
+
xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x8,
|
67 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
68 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
69 |
+
->UseRealTime();
|
70 |
+
BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x16,
|
71 |
+
xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x16,
|
72 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
73 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
74 |
+
->UseRealTime();
|
75 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
76 |
+
|
77 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
78 |
+
BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x8,
|
79 |
+
xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x8,
|
80 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
81 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
82 |
+
->UseRealTime();
|
83 |
+
BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x16,
|
84 |
+
xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x16,
|
85 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
86 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
87 |
+
->UseRealTime();
|
88 |
+
BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x24,
|
89 |
+
xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x24,
|
90 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
91 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
92 |
+
->UseRealTime();
|
93 |
+
BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x32,
|
94 |
+
xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x32,
|
95 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
96 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
97 |
+
->UseRealTime();
|
98 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
99 |
+
|
100 |
+
#if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
101 |
+
BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x1,
|
102 |
+
xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x1,
|
103 |
+
benchmark::utils::CheckFP16ARITH)
|
104 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
105 |
+
->UseRealTime();
|
106 |
+
BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x2,
|
107 |
+
xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x2,
|
108 |
+
benchmark::utils::CheckFP16ARITH)
|
109 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
110 |
+
->UseRealTime();
|
111 |
+
BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x4,
|
112 |
+
xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x4,
|
113 |
+
benchmark::utils::CheckFP16ARITH)
|
114 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
115 |
+
->UseRealTime();
|
116 |
+
#endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
117 |
+
|
118 |
+
|
119 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
120 |
+
BENCHMARK_MAIN();
|
121 |
+
#endif
|
bench/f16-vtanh.cc
ADDED
@@ -0,0 +1,807 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Copyright 2023 Google LLC
|
2 |
+
//
|
3 |
+
// This source code is licensed under the BSD-style license found in the
|
4 |
+
// LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
#include <algorithm>
|
7 |
+
#include <cmath>
|
8 |
+
#include <functional>
|
9 |
+
#include <random>
|
10 |
+
#include <vector>
|
11 |
+
|
12 |
+
#include <benchmark/benchmark.h>
|
13 |
+
#include <fp16/fp16.h>
|
14 |
+
#include "bench/utils.h"
|
15 |
+
|
16 |
+
#include <xnnpack.h>
|
17 |
+
#include <xnnpack/aligned-allocator.h>
|
18 |
+
#include <xnnpack/common.h>
|
19 |
+
#include <xnnpack/microfnptr.h>
|
20 |
+
#include <xnnpack/microparams-init.h>
|
21 |
+
#include <xnnpack/vunary.h>
|
22 |
+
|
23 |
+
|
24 |
+
static void f16_vtanh(
|
25 |
+
benchmark::State& state,
|
26 |
+
xnn_f16_vtanh_ukernel_fn tanh,
|
27 |
+
xnn_init_f16_tanh_params_fn init_params = nullptr,
|
28 |
+
benchmark::utils::IsaCheckFunction isa_check = nullptr)
|
29 |
+
{
|
30 |
+
if (isa_check != nullptr && !isa_check(state)) {
|
31 |
+
return;
|
32 |
+
}
|
33 |
+
|
34 |
+
const size_t num_elements = state.range(0);
|
35 |
+
|
36 |
+
std::random_device random_device;
|
37 |
+
auto rng = std::mt19937(random_device());
|
38 |
+
auto f32rng = std::bind(std::uniform_real_distribution<float>(-5.0f, 5.0f), std::ref(rng));
|
39 |
+
auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
|
40 |
+
|
41 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
|
42 |
+
std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
|
43 |
+
std::generate(x.begin(), x.end(), std::ref(f16rng));
|
44 |
+
std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
|
45 |
+
|
46 |
+
xnn_f16_tanh_params params;
|
47 |
+
if (init_params != nullptr) {
|
48 |
+
init_params(¶ms);
|
49 |
+
}
|
50 |
+
for (auto _ : state) {
|
51 |
+
tanh(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms);
|
52 |
+
}
|
53 |
+
|
54 |
+
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
|
55 |
+
if (cpu_frequency != 0) {
|
56 |
+
state.counters["cpufreq"] = cpu_frequency;
|
57 |
+
}
|
58 |
+
|
59 |
+
const size_t elements_per_iteration = num_elements;
|
60 |
+
state.counters["elements"] =
|
61 |
+
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
|
62 |
+
|
63 |
+
const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
|
64 |
+
state.counters["bytes"] =
|
65 |
+
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
|
66 |
+
}
|
67 |
+
|
68 |
+
|
69 |
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
|
70 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x8,
|
71 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x8,
|
72 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
73 |
+
benchmark::utils::CheckAVX2)
|
74 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
75 |
+
->UseRealTime();
|
76 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x16,
|
77 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x16,
|
78 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
79 |
+
benchmark::utils::CheckAVX2)
|
80 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
81 |
+
->UseRealTime();
|
82 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x24,
|
83 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x24,
|
84 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
85 |
+
benchmark::utils::CheckAVX2)
|
86 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
87 |
+
->UseRealTime();
|
88 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x32,
|
89 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x32,
|
90 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
91 |
+
benchmark::utils::CheckAVX2)
|
92 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
93 |
+
->UseRealTime();
|
94 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x40,
|
95 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x40,
|
96 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
97 |
+
benchmark::utils::CheckAVX2)
|
98 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
99 |
+
->UseRealTime();
|
100 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x48,
|
101 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x48,
|
102 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
103 |
+
benchmark::utils::CheckAVX2)
|
104 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
105 |
+
->UseRealTime();
|
106 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x56,
|
107 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x56,
|
108 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
109 |
+
benchmark::utils::CheckAVX2)
|
110 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
111 |
+
->UseRealTime();
|
112 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x64,
|
113 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x64,
|
114 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
115 |
+
benchmark::utils::CheckAVX2)
|
116 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
117 |
+
->UseRealTime();
|
118 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x72,
|
119 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x72,
|
120 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
121 |
+
benchmark::utils::CheckAVX2)
|
122 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
123 |
+
->UseRealTime();
|
124 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x80,
|
125 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x80,
|
126 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
127 |
+
benchmark::utils::CheckAVX2)
|
128 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
129 |
+
->UseRealTime();
|
130 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x8,
|
131 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x8,
|
132 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
133 |
+
benchmark::utils::CheckAVX2)
|
134 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
135 |
+
->UseRealTime();
|
136 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x16,
|
137 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x16,
|
138 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
139 |
+
benchmark::utils::CheckAVX2)
|
140 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
141 |
+
->UseRealTime();
|
142 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x24,
|
143 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x24,
|
144 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
145 |
+
benchmark::utils::CheckAVX2)
|
146 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
147 |
+
->UseRealTime();
|
148 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x32,
|
149 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x32,
|
150 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
151 |
+
benchmark::utils::CheckAVX2)
|
152 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
153 |
+
->UseRealTime();
|
154 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x40,
|
155 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x40,
|
156 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
157 |
+
benchmark::utils::CheckAVX2)
|
158 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
159 |
+
->UseRealTime();
|
160 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x48,
|
161 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x48,
|
162 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
163 |
+
benchmark::utils::CheckAVX2)
|
164 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
165 |
+
->UseRealTime();
|
166 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x56,
|
167 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x56,
|
168 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
169 |
+
benchmark::utils::CheckAVX2)
|
170 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
171 |
+
->UseRealTime();
|
172 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x64,
|
173 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x64,
|
174 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
175 |
+
benchmark::utils::CheckAVX2)
|
176 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
177 |
+
->UseRealTime();
|
178 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x72,
|
179 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x72,
|
180 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
181 |
+
benchmark::utils::CheckAVX2)
|
182 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
183 |
+
->UseRealTime();
|
184 |
+
BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x80,
|
185 |
+
xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x80,
|
186 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
187 |
+
benchmark::utils::CheckAVX2)
|
188 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
189 |
+
->UseRealTime();
|
190 |
+
|
191 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x8,
|
192 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x8,
|
193 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
194 |
+
benchmark::utils::CheckFMA3)
|
195 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
196 |
+
->UseRealTime();
|
197 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x16,
|
198 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x16,
|
199 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
200 |
+
benchmark::utils::CheckFMA3)
|
201 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
202 |
+
->UseRealTime();
|
203 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x24,
|
204 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x24,
|
205 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
206 |
+
benchmark::utils::CheckFMA3)
|
207 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
208 |
+
->UseRealTime();
|
209 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x32,
|
210 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x32,
|
211 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
212 |
+
benchmark::utils::CheckFMA3)
|
213 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
214 |
+
->UseRealTime();
|
215 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x40,
|
216 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x40,
|
217 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
218 |
+
benchmark::utils::CheckFMA3)
|
219 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
220 |
+
->UseRealTime();
|
221 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x48,
|
222 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x48,
|
223 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
224 |
+
benchmark::utils::CheckFMA3)
|
225 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
226 |
+
->UseRealTime();
|
227 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x56,
|
228 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x56,
|
229 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
230 |
+
benchmark::utils::CheckFMA3)
|
231 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
232 |
+
->UseRealTime();
|
233 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x64,
|
234 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x64,
|
235 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
236 |
+
benchmark::utils::CheckFMA3)
|
237 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
238 |
+
->UseRealTime();
|
239 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x72,
|
240 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x72,
|
241 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
242 |
+
benchmark::utils::CheckFMA3)
|
243 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
244 |
+
->UseRealTime();
|
245 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x80,
|
246 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x80,
|
247 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
248 |
+
benchmark::utils::CheckFMA3)
|
249 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
250 |
+
->UseRealTime();
|
251 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x8,
|
252 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x8,
|
253 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
254 |
+
benchmark::utils::CheckFMA3)
|
255 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
256 |
+
->UseRealTime();
|
257 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x16,
|
258 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x16,
|
259 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
260 |
+
benchmark::utils::CheckFMA3)
|
261 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
262 |
+
->UseRealTime();
|
263 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x24,
|
264 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x24,
|
265 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
266 |
+
benchmark::utils::CheckFMA3)
|
267 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
268 |
+
->UseRealTime();
|
269 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x32,
|
270 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x32,
|
271 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
272 |
+
benchmark::utils::CheckFMA3)
|
273 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
274 |
+
->UseRealTime();
|
275 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x40,
|
276 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x40,
|
277 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
278 |
+
benchmark::utils::CheckFMA3)
|
279 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
280 |
+
->UseRealTime();
|
281 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x48,
|
282 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x48,
|
283 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
284 |
+
benchmark::utils::CheckFMA3)
|
285 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
286 |
+
->UseRealTime();
|
287 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x56,
|
288 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x56,
|
289 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
290 |
+
benchmark::utils::CheckFMA3)
|
291 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
292 |
+
->UseRealTime();
|
293 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x64,
|
294 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x64,
|
295 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
296 |
+
benchmark::utils::CheckFMA3)
|
297 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
298 |
+
->UseRealTime();
|
299 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x72,
|
300 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x72,
|
301 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
302 |
+
benchmark::utils::CheckFMA3)
|
303 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
304 |
+
->UseRealTime();
|
305 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x80,
|
306 |
+
xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x80,
|
307 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
308 |
+
benchmark::utils::CheckFMA3)
|
309 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
310 |
+
->UseRealTime();
|
311 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x8,
|
312 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x8,
|
313 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
314 |
+
benchmark::utils::CheckFMA3)
|
315 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
316 |
+
->UseRealTime();
|
317 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x16,
|
318 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x16,
|
319 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
320 |
+
benchmark::utils::CheckFMA3)
|
321 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
322 |
+
->UseRealTime();
|
323 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x24,
|
324 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x24,
|
325 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
326 |
+
benchmark::utils::CheckFMA3)
|
327 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
328 |
+
->UseRealTime();
|
329 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x32,
|
330 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x32,
|
331 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
332 |
+
benchmark::utils::CheckFMA3)
|
333 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
334 |
+
->UseRealTime();
|
335 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x40,
|
336 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x40,
|
337 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
338 |
+
benchmark::utils::CheckFMA3)
|
339 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
340 |
+
->UseRealTime();
|
341 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x48,
|
342 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x48,
|
343 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
344 |
+
benchmark::utils::CheckFMA3)
|
345 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
346 |
+
->UseRealTime();
|
347 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x56,
|
348 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x56,
|
349 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
350 |
+
benchmark::utils::CheckFMA3)
|
351 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
352 |
+
->UseRealTime();
|
353 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x64,
|
354 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x64,
|
355 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
356 |
+
benchmark::utils::CheckFMA3)
|
357 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
358 |
+
->UseRealTime();
|
359 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x72,
|
360 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x72,
|
361 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
362 |
+
benchmark::utils::CheckFMA3)
|
363 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
364 |
+
->UseRealTime();
|
365 |
+
BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x80,
|
366 |
+
xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x80,
|
367 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
368 |
+
benchmark::utils::CheckFMA3)
|
369 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
370 |
+
->UseRealTime();
|
371 |
+
|
372 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x8,
|
373 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x8,
|
374 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
375 |
+
benchmark::utils::CheckF16C)
|
376 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
377 |
+
->UseRealTime();
|
378 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x16,
|
379 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x16,
|
380 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
381 |
+
benchmark::utils::CheckF16C)
|
382 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
383 |
+
->UseRealTime();
|
384 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x24,
|
385 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x24,
|
386 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
387 |
+
benchmark::utils::CheckF16C)
|
388 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
389 |
+
->UseRealTime();
|
390 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x32,
|
391 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x32,
|
392 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
393 |
+
benchmark::utils::CheckF16C)
|
394 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
395 |
+
->UseRealTime();
|
396 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x40,
|
397 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x40,
|
398 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
399 |
+
benchmark::utils::CheckF16C)
|
400 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
401 |
+
->UseRealTime();
|
402 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x48,
|
403 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x48,
|
404 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
405 |
+
benchmark::utils::CheckF16C)
|
406 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
407 |
+
->UseRealTime();
|
408 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x56,
|
409 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x56,
|
410 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
411 |
+
benchmark::utils::CheckF16C)
|
412 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
413 |
+
->UseRealTime();
|
414 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x64,
|
415 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x64,
|
416 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
417 |
+
benchmark::utils::CheckF16C)
|
418 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
419 |
+
->UseRealTime();
|
420 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x72,
|
421 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x72,
|
422 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
423 |
+
benchmark::utils::CheckF16C)
|
424 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
425 |
+
->UseRealTime();
|
426 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x80,
|
427 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x80,
|
428 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
429 |
+
benchmark::utils::CheckF16C)
|
430 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
431 |
+
->UseRealTime();
|
432 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x8,
|
433 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x8,
|
434 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
435 |
+
benchmark::utils::CheckF16C)
|
436 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
437 |
+
->UseRealTime();
|
438 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x16,
|
439 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x16,
|
440 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
441 |
+
benchmark::utils::CheckF16C)
|
442 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
443 |
+
->UseRealTime();
|
444 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x24,
|
445 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x24,
|
446 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
447 |
+
benchmark::utils::CheckF16C)
|
448 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
449 |
+
->UseRealTime();
|
450 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x32,
|
451 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x32,
|
452 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
453 |
+
benchmark::utils::CheckF16C)
|
454 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
455 |
+
->UseRealTime();
|
456 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x40,
|
457 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x40,
|
458 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
459 |
+
benchmark::utils::CheckF16C)
|
460 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
461 |
+
->UseRealTime();
|
462 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x48,
|
463 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x48,
|
464 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
465 |
+
benchmark::utils::CheckF16C)
|
466 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
467 |
+
->UseRealTime();
|
468 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x56,
|
469 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x56,
|
470 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
471 |
+
benchmark::utils::CheckF16C)
|
472 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
473 |
+
->UseRealTime();
|
474 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x64,
|
475 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x64,
|
476 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
477 |
+
benchmark::utils::CheckF16C)
|
478 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
479 |
+
->UseRealTime();
|
480 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x72,
|
481 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x72,
|
482 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
483 |
+
benchmark::utils::CheckF16C)
|
484 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
485 |
+
->UseRealTime();
|
486 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x80,
|
487 |
+
xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x80,
|
488 |
+
xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
|
489 |
+
benchmark::utils::CheckF16C)
|
490 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
491 |
+
->UseRealTime();
|
492 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x8,
|
493 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x8,
|
494 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
495 |
+
benchmark::utils::CheckF16C)
|
496 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
497 |
+
->UseRealTime();
|
498 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x16,
|
499 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x16,
|
500 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
501 |
+
benchmark::utils::CheckF16C)
|
502 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
503 |
+
->UseRealTime();
|
504 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x24,
|
505 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x24,
|
506 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
507 |
+
benchmark::utils::CheckF16C)
|
508 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
509 |
+
->UseRealTime();
|
510 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x32,
|
511 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x32,
|
512 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
513 |
+
benchmark::utils::CheckF16C)
|
514 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
515 |
+
->UseRealTime();
|
516 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x40,
|
517 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x40,
|
518 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
519 |
+
benchmark::utils::CheckF16C)
|
520 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
521 |
+
->UseRealTime();
|
522 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x48,
|
523 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x48,
|
524 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
525 |
+
benchmark::utils::CheckF16C)
|
526 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
527 |
+
->UseRealTime();
|
528 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x56,
|
529 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x56,
|
530 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
531 |
+
benchmark::utils::CheckF16C)
|
532 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
533 |
+
->UseRealTime();
|
534 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x64,
|
535 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x64,
|
536 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
537 |
+
benchmark::utils::CheckF16C)
|
538 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
539 |
+
->UseRealTime();
|
540 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x72,
|
541 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x72,
|
542 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
543 |
+
benchmark::utils::CheckF16C)
|
544 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
545 |
+
->UseRealTime();
|
546 |
+
BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x80,
|
547 |
+
xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x80,
|
548 |
+
xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
|
549 |
+
benchmark::utils::CheckF16C)
|
550 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
551 |
+
->UseRealTime();
|
552 |
+
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
|
553 |
+
|
554 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
555 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
|
556 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
|
557 |
+
nullptr,
|
558 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
559 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
560 |
+
->UseRealTime();
|
561 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
|
562 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
|
563 |
+
nullptr,
|
564 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
565 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
566 |
+
->UseRealTime();
|
567 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
|
568 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
|
569 |
+
nullptr,
|
570 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
571 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
572 |
+
->UseRealTime();
|
573 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
|
574 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
|
575 |
+
nullptr,
|
576 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
577 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
578 |
+
->UseRealTime();
|
579 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
|
580 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
|
581 |
+
nullptr,
|
582 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
583 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
584 |
+
->UseRealTime();
|
585 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
|
586 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
|
587 |
+
nullptr,
|
588 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
589 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
590 |
+
->UseRealTime();
|
591 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
|
592 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
|
593 |
+
nullptr,
|
594 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
595 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
596 |
+
->UseRealTime();
|
597 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
|
598 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
|
599 |
+
nullptr,
|
600 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
601 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
602 |
+
->UseRealTime();
|
603 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
|
604 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
|
605 |
+
nullptr,
|
606 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
607 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
608 |
+
->UseRealTime();
|
609 |
+
BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
|
610 |
+
xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
|
611 |
+
nullptr,
|
612 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
613 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
614 |
+
->UseRealTime();
|
615 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
|
616 |
+
|
617 |
+
|
618 |
+
#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
619 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
|
620 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
|
621 |
+
nullptr,
|
622 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
623 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
624 |
+
->UseRealTime();
|
625 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
|
626 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
|
627 |
+
nullptr,
|
628 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
629 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
630 |
+
->UseRealTime();
|
631 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
|
632 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
|
633 |
+
nullptr,
|
634 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
635 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
636 |
+
->UseRealTime();
|
637 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
|
638 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
|
639 |
+
nullptr,
|
640 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
641 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
642 |
+
->UseRealTime();
|
643 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
|
644 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
|
645 |
+
nullptr,
|
646 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
647 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
648 |
+
->UseRealTime();
|
649 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
|
650 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
|
651 |
+
nullptr,
|
652 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
653 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
654 |
+
->UseRealTime();
|
655 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
|
656 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
|
657 |
+
nullptr,
|
658 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
659 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
660 |
+
->UseRealTime();
|
661 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
|
662 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
|
663 |
+
nullptr,
|
664 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
665 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
666 |
+
->UseRealTime();
|
667 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
|
668 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
|
669 |
+
nullptr,
|
670 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
671 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
672 |
+
->UseRealTime();
|
673 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
|
674 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
|
675 |
+
nullptr,
|
676 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
677 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
678 |
+
->UseRealTime();
|
679 |
+
|
680 |
+
|
681 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
|
682 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
|
683 |
+
nullptr,
|
684 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
685 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
686 |
+
->UseRealTime();
|
687 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
|
688 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
|
689 |
+
nullptr,
|
690 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
691 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
692 |
+
->UseRealTime();
|
693 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
|
694 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
|
695 |
+
nullptr,
|
696 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
697 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
698 |
+
->UseRealTime();
|
699 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
|
700 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
|
701 |
+
nullptr,
|
702 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
703 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
704 |
+
->UseRealTime();
|
705 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
|
706 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
|
707 |
+
nullptr,
|
708 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
709 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
710 |
+
->UseRealTime();
|
711 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
|
712 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
|
713 |
+
nullptr,
|
714 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
715 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
716 |
+
->UseRealTime();
|
717 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
|
718 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
|
719 |
+
nullptr,
|
720 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
721 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
722 |
+
->UseRealTime();
|
723 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
|
724 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
|
725 |
+
nullptr,
|
726 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
727 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
728 |
+
->UseRealTime();
|
729 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
|
730 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
|
731 |
+
nullptr,
|
732 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
733 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
734 |
+
->UseRealTime();
|
735 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
|
736 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
|
737 |
+
nullptr,
|
738 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
739 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
740 |
+
->UseRealTime();
|
741 |
+
|
742 |
+
|
743 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
|
744 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
|
745 |
+
nullptr,
|
746 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
747 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
748 |
+
->UseRealTime();
|
749 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
|
750 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
|
751 |
+
nullptr,
|
752 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
753 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
754 |
+
->UseRealTime();
|
755 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
|
756 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
|
757 |
+
nullptr,
|
758 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
759 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
760 |
+
->UseRealTime();
|
761 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
|
762 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
|
763 |
+
nullptr,
|
764 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
765 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
766 |
+
->UseRealTime();
|
767 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
|
768 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
|
769 |
+
nullptr,
|
770 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
771 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
772 |
+
->UseRealTime();
|
773 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
|
774 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
|
775 |
+
nullptr,
|
776 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
777 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
778 |
+
->UseRealTime();
|
779 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
|
780 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
|
781 |
+
nullptr,
|
782 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
783 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
784 |
+
->UseRealTime();
|
785 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
|
786 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
|
787 |
+
nullptr,
|
788 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
789 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
790 |
+
->UseRealTime();
|
791 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
|
792 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
|
793 |
+
nullptr,
|
794 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
795 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
796 |
+
->UseRealTime();
|
797 |
+
BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
|
798 |
+
xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
|
799 |
+
nullptr,
|
800 |
+
benchmark::utils::CheckNEONFP16ARITH)
|
801 |
+
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
|
802 |
+
->UseRealTime();
|
803 |
+
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
|
804 |
+
|
805 |
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
|
806 |
+
BENCHMARK_MAIN();
|
807 |
+
#endif
|