Androidonnxfork committed
Commit: 8b7c501
Parent: 842b645

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.
.bazelrc ADDED
@@ -0,0 +1,57 @@
+ # Basic build settings
+ build --jobs 128
+ build --cxxopt='-std=gnu++14'
+
+ # Sets the default Apple platform to macOS.
+ build --apple_platform_type=macos
+
+ # Android configs.
+ build:android --crosstool_top=//external:android/crosstool
+ build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+ build:android --linkopt=-ldl
+ build:android --linkopt=-Wl,--gc-sections
+
+ build:android_arm --config=android
+ build:android_arm --cpu=armeabi-v7a
+ build:android_arm --fat_apk_cpu=armeabi-v7a
+
+ build:android_arm64 --config=android
+ build:android_arm64 --cpu=arm64-v8a
+ build:android_arm64 --fat_apk_cpu=arm64-v8a
+
+ # iOS configs.
+ build:ios --apple_platform_type=ios
+
+ build:ios_i386 --config=ios
+ build:ios_i386 --cpu=ios_i386
+ build:ios_i386 --watchos_cpus=i386
+
+ build:ios_x86_64 --config=ios
+ build:ios_x86_64 --cpu=ios_x86_64
+ build:ios_x86_64 --watchos_cpus=i386
+
+ build:ios_armv7 --config=ios
+ build:ios_armv7 --cpu=ios_armv7
+ build:ios_armv7 --watchos_cpus=armv7k
+
+ build:ios_arm64 --config=ios
+ build:ios_arm64 --cpu=ios_arm64
+ build:ios_arm64 --watchos_cpus=armv7k
+
+ build:ios_arm64e --config=ios
+ build:ios_arm64e --cpu=ios_arm64e
+ build:ios_arm64e --watchos_cpus=armv7k
+
+ build:ios_sim_arm64 --config=ios
+ build:ios_sim_arm64 --cpu=ios_sim_arm64
+ build:ios_sim_arm64 --watchos_cpus=armv7k
+
+ build:ios_fat --config=ios
+ build:ios_fat --ios_multi_cpus=armv7,arm64
+ build:ios_fat --watchos_cpus=armv7k
+
+ # macOS configs.
+ build:macos --apple_platform_type=macos
+
+ build:macos_arm64 --config=macos
+ build:macos_arm64 --cpu=darwin_arm64
.clang-format ADDED
@@ -0,0 +1,34 @@
+ AllowShortFunctionsOnASingleLine: Inline
+ PackConstructorInitializers: Never
+ ColumnLimit: 120
+ AlignAfterOpenBracket: AlwaysBreak
+ BinPackParameters: false
+ AllowAllParametersOfDeclarationOnNextLine: true
+ BreakBeforeBraces: Stroustrup
+ SpaceAfterCStyleCast: true
+ PointerAlignment: Left
+ ForEachMacros: ['XNN_UNPREDICTABLE', 'XNN_LIKELY', 'XNN_UNLIKELY']
+ IfMacros: ['IF']
+ IndentCaseLabels: true
+ ContinuationIndentWidth: 2
+ SpaceBeforeParens: Custom
+ SpaceBeforeParensOptions:
+   AfterControlStatements: true
+   AfterIfMacros: true
+   AfterForeachMacros: false
+ SpacesBeforeTrailingComments: 2
+ IncludeBlocks: Regroup
+ IncludeCategories:
+   - Regex: '<xnnpack[./][[:alnum:].-]+>'  # match XNNPACK includes first
+     Priority: 5
+   - Regex: 'benchmark.h'  # includes used in benchmarks
+     Priority: 3
+   - Regex: 'bench/'  # includes used in benchmarks
+     Priority: 3
+   - Regex: 'gtest.h'  # includes used in tests
+     Priority: 3
+   - Regex: 'gmock.h'  # includes used in tests
+     Priority: 3
+   - Regex: '<[[:alnum:].]+>'  # system headers
+     Priority: 2  # lower priority to keep it sorted first before XNNPACK includes
+ MaxEmptyLinesToKeep: 2  # used to separate includes from functions
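
With `IncludeBlocks: Regroup`, clang-format re-sorts include blocks by ascending `Priority`, matching each include against the regexes above in listed order. Under these rules a source file would come out roughly like the following hypothetical sketch (the file names are placeholders, not taken from this commit):

```cpp
#include <cstddef>           // Priority 2: system headers sort first
#include <vector>

#include <gtest/gtest.h>     // Priority 3: test and benchmark headers next

#include <xnnpack/common.h>  // Priority 5: XNNPACK headers last
```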
.gitattributes CHANGED
@@ -121,3 +121,5 @@ fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/m
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_decoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.ort filter=lfs diff=lfs merge=lfs -text
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
+ build/CMakeFiles/microkernels-all.dir/build.make filter=lfs diff=lfs merge=lfs -text
+ build/libXNNPACK.a filter=lfs diff=lfs merge=lfs -text
.github/workflows/build.yml ADDED
@@ -0,0 +1,207 @@
+ name: Build using CMake
+ on:
+   push:
+     paths:
+       - '**.S'
+       - '**.c'
+       - '**.cc'
+       - '**.h'
+       - 'CMakeLists.txt'
+       - 'cmake/**'
+       - 'scripts/build-*.sh'
+       - '.github/**/*.yml'
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+   cancel-in-progress: true
+ jobs:
+   cmake-linux-local:
+     runs-on: ubuntu-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Configure and build
+         run: scripts/build-local.sh
+         working-directory: ${{ github.workspace }}
+   cmake-linux-aarch64:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install aarch64 cross-toolchain
+         run: sudo apt install crossbuild-essential-arm64
+       - name: Install qemu-aarch64
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-aarch64.sh -DCMAKE_BUILD_TYPE=Release
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/aarch64
+   cmake-linux-armhf:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 90
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install armhf cross-toolchain
+         run: sudo apt install crossbuild-essential-armhf
+       - name: Install qemu-arm
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-armhf.sh -DCMAKE_BUILD_TYPE=Release
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/armhf
+   cmake-linux-riscv64:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install riscv64 cross-toolchain
+         run: sudo apt install crossbuild-essential-riscv64
+       - name: Install qemu-riscv64
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-riscv64.sh -DCMAKE_BUILD_TYPE=Release -DXNNPACK_ENABLE_RISCV_VECTOR=OFF
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/riscv64
+   cmake-windows-arm64:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-arm64.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+   cmake-windows-x64:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-x64.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+         env:
+           CFLAGS: "/UNDEBUG"
+           CXXFLAGS: "/UNDEBUG"
+       - name: Run tests
+         run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
+         working-directory: ${{ github.workspace }}/build/windows/x64
+   cmake-windows-x86:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-x86.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+         env:
+           CFLAGS: "/UNDEBUG"
+           CXXFLAGS: "/UNDEBUG"
+       - name: Run tests
+         run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
+         working-directory: ${{ github.workspace }}/build/windows/x86
+   cmake-macos-arm64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/macos/arm64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=arm64 -DHAVE_STD_REGEX=TRUE ../../..
+         working-directory: ${{ github.workspace }}/build/macos/arm64
+       - name: Build with Xcode
+         run: cmake --build build/macos/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+   cmake-macos-x86_64:
+     runs-on: macos-latest
+     timeout-minutes: 90
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/macos/x86_64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=x86_64 -DHAVE_STD_REGEX=TRUE ../../..
+         working-directory: ${{ github.workspace }}/build/macos/x86_64
+       - name: Build with Xcode
+         run: cmake --build build/macos/x86_64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --build-config Debug --output-on-failure --parallel $(sysctl -n hw.ncpu)
+         working-directory: ${{ github.workspace }}/build/macos/x86_64
+   cmake-android:
+     strategy:
+       matrix:
+         script: [build-android-arm64.sh, build-android-armv7.sh, build-android-x86.sh]
+     runs-on: ubuntu-latest
+     timeout-minutes: 40
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Setup Android NDK
+         id: setup-ndk
+         uses: nttld/setup-ndk@v1
+         with:
+           ndk-version: r23b
+           add-to-path: false
+       - name: Configure and build
+         run: scripts/${{ matrix.script }}
+         working-directory: ${{ github.workspace }}
+         env:
+           ANDROID_NDK: ${{ steps.setup-ndk.outputs.ndk-path }}
+   cmake-ios-arm64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/ios/arm64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=arm64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
+         working-directory: ${{ github.workspace }}/build/ios/arm64
+       - name: Build with Xcode
+         run: cmake --build build/ios/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+   cmake-ios-x86_64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/ios/x86_64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=x86_64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
+         working-directory: ${{ github.workspace }}/build/ios/x86_64
+       - name: Build with Xcode
+         run: cmake --build build/ios/x86_64 --parallel $(sysctl -n hw.ncpu) -- -sdk iphonesimulator -quiet
+         working-directory: ${{ github.workspace }}
+
.gitignore CHANGED
@@ -1,15 +1,35 @@
- *.iml
- .gradle
- /local.properties
- /.idea/caches
- /.idea/libraries
- /.idea/modules.xml
- /.idea/workspace.xml
- /.idea/navEditor.xml
- /.idea/assetWizardSettings.xml
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # Copyright 2019 Google LLC
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Build objects and artifacts
+ bazel-bin
+ bazel-genfiles
+ bazel-out
+ bazel-testlogs
+ bazel-XNNPACK
+ bin/
+ build/
+ build-*/
+ deps/
+ lib/
+ libs/
+ obj/
+ out/
+ *.pyc
+ *.pyo
+ *.log
+
+ # System files
  .DS_Store
- /build
- /captures
- .externalNativeBuild
- .cxx
- local.properties
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+ *.swp
BUILD.bazel ADDED
The diff for this file is too large to render.
 
CMakeLists.txt ADDED
The diff for this file is too large to render.
 
CONTRIBUTING.md ADDED
@@ -0,0 +1,28 @@
+ # How to Contribute
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ## Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com/> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ## Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
+
+ ## Community Guidelines
+
+ This project follows [Google's Open Source Community
+ Guidelines](https://opensource.google.com/conduct/).
LICENSE ADDED
@@ -0,0 +1,31 @@
+ BSD License
+
+ For XNNPACK software
+
+ Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ Copyright 2019 Google LLC
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+  * Neither the name Facebook nor the names of its contributors may be used to
+    endorse or promote products derived from this software without specific
+    prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -1,12 +1,130 @@
- ---
- title: Test
- emoji: 📊
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 3.38.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # XNNPACK
+
+ XNNPACK is a highly optimized solution for neural network inference on ARM, x86, WebAssembly, and RISC-V platforms. XNNPACK is not intended for direct use by deep learning practitioners and researchers; instead it provides low-level performance primitives for accelerating high-level machine learning frameworks, such as [TensorFlow Lite](https://www.tensorflow.org/lite), [TensorFlow.js](https://www.tensorflow.org/js), [PyTorch](https://pytorch.org/), [ONNX Runtime](https://onnxruntime.ai), and [MediaPipe](https://mediapipe.dev).
+
+ ## Supported Architectures
+
+ - ARM64 on Android, iOS, macOS, Linux, and Windows
+ - ARMv7 (with NEON) on Android
+ - ARMv6 (with VFPv2) on Linux
+ - x86 and x86-64 (up to AVX512) on Windows, Linux, macOS, Android, and iOS simulator
+ - WebAssembly MVP
+ - WebAssembly SIMD
+ - [WebAssembly Relaxed SIMD](https://github.com/WebAssembly/relaxed-simd) (experimental)
+ - RISC-V (RV32GC and RV64GC)
+
+ ## Operator Coverage
+
+ XNNPACK implements the following neural network operators:
+
+ - 2D Convolution (including grouped and depthwise)
+ - 2D Deconvolution (AKA Transposed Convolution)
+ - 2D Average Pooling
+ - 2D Max Pooling
+ - 2D ArgMax Pooling (Max Pooling + indices)
+ - 2D Unpooling
+ - 2D Bilinear Resize
+ - 2D Depth-to-Space (AKA Pixel Shuffle)
+ - Add (including broadcasting, two inputs only)
+ - Subtract (including broadcasting)
+ - Divide (including broadcasting)
+ - Maximum (including broadcasting)
+ - Minimum (including broadcasting)
+ - Multiply (including broadcasting)
+ - Squared Difference (including broadcasting)
+ - Global Average Pooling
+ - Channel Shuffle
+ - Fully Connected
+ - Abs (absolute value)
+ - Bankers' Rounding (rounding to nearest, ties to even)
+ - Ceiling (rounding to integer above)
+ - Clamp (includes ReLU and ReLU6)
+ - Convert (includes fixed-point and half-precision quantization and
+   dequantization)
+ - Copy
+ - ELU
+ - Floor (rounding to integer below)
+ - HardSwish
+ - Leaky ReLU
+ - Negate
+ - Sigmoid
+ - Softmax
+ - Square
+ - Tanh
+ - Transpose
+ - Truncation (rounding to integer towards zero)
+ - PReLU
+
+ All operators in XNNPACK support NHWC layout, but additionally allow a custom stride along the **C**hannel dimension. Thus, operators can consume a subset of channels in the input tensor and produce a subset of channels in the output tensor, providing zero-cost Channel Split and Channel Concatenation operations; the sketch after this paragraph illustrates the mechanism.
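
A minimal illustrative sketch of the channel-stride mechanism (not part of the committed README). It mirrors the create/reshape/setup/run calls that appear in this commit's `bench/abs.cc`; the 8-channel tensor, the 3-channel slice, and the names `pixels`, `input`, and `output` are made-up placeholders:

```cpp
#include <xnnpack.h>

// Assumes xnn_initialize(nullptr) has already succeeded.
// Apply Abs to a 3-channel slice of an 8-channel NHWC tensor:
// input pixels are 8 floats apart, the output is packed densely.
xnn_operator_t abs_op = nullptr;
xnn_create_abs_nc_f32(/*channels=*/3, /*input_stride=*/8,
                      /*output_stride=*/3, /*flags=*/0, &abs_op);
xnn_reshape_abs_nc_f32(abs_op, /*batch_size=*/pixels, /*threadpool=*/nullptr);
xnn_setup_abs_nc_f32(abs_op, input /* may point at any channel offset */, output);
xnn_run_operator(abs_op, /*threadpool=*/nullptr);
xnn_delete_operator(abs_op);
```

Because only the strides differ from the dense case, the slicing itself costs nothing, which is what makes Channel Split and Channel Concatenation zero-cost.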
+
+ ## Performance
+
+ ### Mobile phones
+
+ The table below presents **single-threaded** performance of the XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
+
+ | Model                   | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
+ | ----------------------- | :-------: | :---------: | :----------: |
+ | FP32 MobileNet v1 1.0X  | 82        | 86          | 88           |
+ | FP32 MobileNet v2 1.0X  | 49        | 53          | 55           |
+ | FP32 MobileNet v3 Large | 39        | 42          | 44           |
+ | FP32 MobileNet v3 Small | 12        | 14          | 14           |
+
+ The following table presents **multi-threaded** (using as many threads as there are big cores) performance of the XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
+
+ | Model                   | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
+ | ----------------------- | :-------: | :---------: | :----------: |
+ | FP32 MobileNet v1 1.0X  | 43        | 27          | 46           |
+ | FP32 MobileNet v2 1.0X  | 26        | 18          | 28           |
+ | FP32 MobileNet v3 Large | 22        | 16          | 24           |
+ | FP32 MobileNet v3 Small | 7         | 6           | 8            |
+
+ Benchmarked on March 27, 2020 with `end2end_bench --benchmark_min_time=5` on an Android/ARM64 build with Android NDK r21 (`bazel build -c opt --config android_arm64 :end2end_bench`) and neural network models with randomized weights and inputs.
+
+ ### Raspberry Pi
+
+ The table below presents **multi-threaded** performance of the XNNPACK library on three generations of MobileNet models and several generations of Raspberry Pi boards.
+
+ | Model                   | RPi Zero W (BCM2835), ms | RPi 2 (BCM2836), ms | RPi 3+ (BCM2837B0), ms | RPi 4 (BCM2711), ms | RPi 4 (BCM2711, ARM64), ms |
+ | ----------------------- | :----------------------: | :-----------------: | :--------------------: | :-----------------: | :------------------------: |
+ | FP32 MobileNet v1 1.0X  | 3919                     | 302                 | 114                    | 72                  | 77                         |
+ | FP32 MobileNet v2 1.0X  | 1987                     | 191                 | 79                     | 41                  | 46                         |
+ | FP32 MobileNet v3 Large | 1658                     | 161                 | 67                     | 38                  | 40                         |
+ | FP32 MobileNet v3 Small | 474                      | 50                  | 22                     | 13                  | 15                         |
+ | INT8 MobileNet v1 1.0X  | 2589                     | 128                 | 46                     | 29                  | 24                         |
+ | INT8 MobileNet v2 1.0X  | 1495                     | 82                  | 30                     | 20                  | 17                         |
+
+ Benchmarked on Feb 8, 2022 with `end2end-bench --benchmark_min_time=5` on a Raspbian Buster build with CMake (`./scripts/build-local.sh`) and neural network models with randomized weights and inputs. INT8 inference was evaluated with a per-channel quantization schema.
+
+ ## Minimum build requirements
+
+ - C11
+ - C++14
+ - Python 3
+
+ ## Publications
+
+ - Marat Dukhan "The Indirect Convolution Algorithm". Presented at the [Efficient Deep Learning for Computer Vision (ECV) 2019](https://sites.google.com/corp/view/ecv2019/) workshop ([slides](https://drive.google.com/file/d/1ZayB3By5ZxxQIRtN7UDq_JvPg1IYd3Ac/view), [paper on ArXiv](https://arxiv.org/abs/1907.02129)).
+ - Erich Elsen, Marat Dukhan, Trevor Gale, Karen Simonyan "Fast Sparse ConvNets".
+   [Paper on ArXiv](https://arxiv.org/abs/1911.09723), [pre-trained sparse
+   models](https://github.com/google-research/google-research/tree/master/fastconvnets).
+ - Marat Dukhan, Artsiom Ablavatski "The Two-Pass Softmax Algorithm".
+   [Paper on ArXiv](https://arxiv.org/abs/2001.04438).
+ - Yury Pisarchyk, Juhyun Lee "Efficient Memory Management for Deep Neural Net Inference".
+   [Paper on ArXiv](https://arxiv.org/abs/2001.03288).
+
+ ## Ecosystem
+
+ ### Machine Learning Frameworks
+
+ - [TensorFlow Lite](https://blog.tensorflow.org/2020/07/accelerating-tensorflow-lite-xnnpack-integration.html)
+ - [TensorFlow.js WebAssembly backend](https://blog.tensorflow.org/2020/03/introducing-webassembly-backend-for-tensorflow-js.html)
+ - [PyTorch Mobile](https://pytorch.org/mobile)
+ - [ONNX Runtime Mobile](https://onnxruntime.ai/docs/execution-providers/Xnnpack-ExecutionProvider.html)
+ - [MediaPipe for the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html)
+ - [Alibaba HALO (Heterogeneity-Aware Lowering and Optimization)](https://github.com/alibaba/heterogeneity-aware-lowering-and-optimization)
+ - [Samsung ONE (On-device Neural Engine)](https://github.com/Samsung/ONE)
+
+ ## Acknowledgements
+
+ XNNPACK is based on the [QNNPACK](https://github.com/pytorch/QNNPACK) library. Over time its codebase has diverged significantly, and the XNNPACK API is no longer compatible with QNNPACK.
WORKSPACE ADDED
@@ -0,0 +1,89 @@
+ workspace(name = "xnnpack")
+
+ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+ # Bazel rule definitions
+ http_archive(
+     name = "rules_cc",
+     strip_prefix = "rules_cc-main",
+     urls = ["https://github.com/bazelbuild/rules_cc/archive/main.zip"],
+ )
+
+ # Bazel Skylib.
+ http_archive(
+     name = "bazel_skylib",
+     sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+     urls = [
+         "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+         "https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+     ],
+ )
+
+ # Google Test framework, used by most unit-tests.
+ http_archive(
+     name = "com_google_googletest",
+     sha256 = "5cb522f1427558c6df572d6d0e1bf0fd076428633d080e88ad5312be0b6a8859",
+     strip_prefix = "googletest-e23cdb78e9fef1f69a9ef917f447add5638daf2a",
+     urls = ["https://github.com/google/googletest/archive/e23cdb78e9fef1f69a9ef917f447add5638daf2a.zip"],
+ )
+
+ # Google Benchmark library, used in micro-benchmarks.
+ http_archive(
+     name = "com_google_benchmark",
+     sha256 = "1ba14374fddcd9623f126b1a60945e4deac4cdc4fb25a5f25e7f779e36f2db52",
+     strip_prefix = "benchmark-d2a8a4ee41b923876c034afb939c4fc03598e622",
+     urls = ["https://github.com/google/benchmark/archive/d2a8a4ee41b923876c034afb939c4fc03598e622.zip"],
+ )
+
+ # FP16 library, used for half-precision conversions
+ http_archive(
+     name = "FP16",
+     build_file = "@//third_party:FP16.BUILD",
+     sha256 = "e66e65515fa09927b348d3d584c68be4215cfe664100d01c9dbc7655a5716d70",
+     strip_prefix = "FP16-0a92994d729ff76a58f692d3028ca1b64b145d91",
+     urls = [
+         "https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip",
+     ],
+ )
+
+ # FXdiv library, used for repeated integer division by the same factor
+ http_archive(
+     name = "FXdiv",
+     sha256 = "ab7dfb08829bee33dca38405d647868fb214ac685e379ec7ef2bebcd234cd44d",
+     strip_prefix = "FXdiv-b408327ac2a15ec3e43352421954f5b1967701d1",
+     urls = ["https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip"],
+ )
+
+ # pthreadpool library, used for parallelization
+ http_archive(
+     name = "pthreadpool",
+     sha256 = "e6370550a1abf1503daf3c2c196e0a1c2b253440c39e1a57740ff49af2d8bedf",
+     strip_prefix = "pthreadpool-43edadc654d6283b4b6e45ba09a853181ae8e850",
+     urls = ["https://github.com/Maratyszcza/pthreadpool/archive/43edadc654d6283b4b6e45ba09a853181ae8e850.zip"],
+ )
+
+ # cpuinfo library, used for detecting processor characteristics
+ http_archive(
+     name = "cpuinfo",
+     sha256 = "609fc42c47482c1fc125dccac65e843f640e792540162581c4b7eb6ff81c826a",
+     strip_prefix = "cpuinfo-87d8234510367db49a65535021af5e1838a65ac2",
+     urls = [
+         "https://github.com/pytorch/cpuinfo/archive/87d8234510367db49a65535021af5e1838a65ac2.zip",
+     ],
+ )
+
+ # Ruy library, used to benchmark against
+ http_archive(
+     name = "ruy",
+     sha256 = "fe8345f521bb378745ebdd0f8c5937414849936851d2ec2609774eb2d7098e54",
+     strip_prefix = "ruy-9f53ba413e6fc879236dcaa3e008915973d67a4f",
+     urls = [
+         "https://github.com/google/ruy/archive/9f53ba413e6fc879236dcaa3e008915973d67a4f.zip",
+     ],
+ )
+
+ # Android NDK location and version is auto-detected from $ANDROID_NDK_HOME environment variable
+ android_ndk_repository(name = "androidndk")
+
+ # Android SDK location and API is auto-detected from $ANDROID_HOME environment variable
+ android_sdk_repository(name = "androidsdk")
bench/abs.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_abs_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t abs_op = nullptr;
+   status = xnn_create_abs_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &abs_op);
+   if (status != xnn_status_success || abs_op == nullptr) {
+     state.SkipWithError("failed to create Abs operator");
+     return;
+   }
+
+   status = xnn_reshape_abs_nc_f16(abs_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Abs operator");
+     return;
+   }
+
+   status = xnn_setup_abs_nc_f16(abs_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Abs operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(abs_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Abs operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(abs_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Abs operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_abs_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t abs_op = nullptr;
+   status = xnn_create_abs_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &abs_op);
+   if (status != xnn_status_success || abs_op == nullptr) {
+     state.SkipWithError("failed to create Abs operator");
+     return;
+   }
+
+   status = xnn_reshape_abs_nc_f32(abs_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Abs operator");
+     return;
+   }
+
+   status = xnn_setup_abs_nc_f32(abs_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Abs operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(abs_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Abs operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(abs_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Abs operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_abs_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_ABS);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+       builder.CreateVector<int32_t>(shape.data(), shape.size()),
+       tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+       builder.CreateVector<int32_t>(shape.data(), shape.size()),
+       tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Abs model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_abs_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_abs_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_abs_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
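
Both XNNPACK benchmarks in this file share one operator lifecycle, which the other `bench/` files in this commit repeat. As a reading sketch (error handling elided; `batch_size`, `input`, and `output` are placeholders), it reduces to:

```cpp
xnn_initialize(/*allocator=*/nullptr);                           // once per process
xnn_operator_t op = nullptr;
xnn_create_abs_nc_f32(1, 1, 1, /*flags=*/0, &op);                // channels + strides
xnn_reshape_abs_nc_f32(op, batch_size, /*threadpool=*/nullptr);  // size-dependent work
xnn_setup_abs_nc_f32(op, input, output);                         // bind buffers
xnn_run_operator(op, /*threadpool=*/nullptr);                    // the timed region
xnn_delete_operator(op);                                         // release
```

Only `xnn_run_operator` sits inside the benchmark loop, so the reported numbers exclude one-time creation, reshape, and setup costs.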
bench/average-pooling.cc ADDED
@@ -0,0 +1,429 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <limits>
14
+ #include <memory>
15
+ #include <random>
16
+ #include <vector>
17
+
18
+ #include <xnnpack.h>
19
+
20
+ #include <benchmark/benchmark.h>
21
+ #ifdef BENCHMARK_TENSORFLOW_LITE
22
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
23
+ #include "tensorflow/lite/interpreter.h"
24
+ #include "tensorflow/lite/kernels/register.h"
25
+ #include "tensorflow/lite/model.h"
26
+ #include "tensorflow/lite/schema/schema_generated.h"
27
+ #include "tensorflow/lite/version.h"
28
+ #endif // BENCHMARK_TENSORFLOW_LITE
29
+ #include "bench/utils.h"
30
+
31
+ static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) {
32
+ const size_t batch_size = state.range(0);
33
+ const size_t input_height = state.range(1);
34
+ const size_t input_width = state.range(2);
35
+ const size_t pooling_size = state.range(3);
36
+ const size_t padding_size = state.range(4);
37
+ const size_t stride = state.range(5);
38
+ const size_t channels = state.range(6);
39
+
40
+ std::random_device random_device;
41
+ auto rng = std::mt19937(random_device());
42
+ auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
43
+
44
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
45
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
46
+
47
+ std::vector<uint8_t> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(uint8_t));
48
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
49
+ std::vector<uint8_t> output(batch_size * output_height * output_width * channels);
50
+ std::fill(output.begin(), output.end(), 0xA5);
51
+
52
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
53
+ if (status != xnn_status_success) {
54
+ state.SkipWithError("failed to initialize XNNPACK");
55
+ return;
56
+ }
57
+
58
+ xnn_operator_t pooling_op = nullptr;
59
+ status = xnn_create_average_pooling2d_nhwc_qu8(
60
+ padding_size, padding_size, padding_size, padding_size,
61
+ pooling_size, pooling_size,
62
+ stride, stride,
63
+ channels, channels /* input pixel stride */, channels /* output pixel stride */,
64
+ 127 /* input zero point */, 0.75f /* input scale */,
65
+ 127 /* output zero point */, 1.25f /* output scale */,
66
+ 0, 255,
67
+ 0 /* flags */, &pooling_op);
68
+ if (status != xnn_status_success) {
69
+ state.SkipWithError("failed to create Average Pooling operator");
70
+ return;
71
+ }
72
+
73
+ status = xnn_reshape_average_pooling2d_nhwc_qu8(
74
+ pooling_op,
75
+ batch_size, input_height, input_width,
76
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
77
+ nullptr /* thread pool */);
78
+ if (status != xnn_status_success) {
79
+ state.SkipWithError("failed to reshape Average Pooling operator");
80
+ return;
81
+ }
82
+
83
+ status = xnn_setup_average_pooling2d_nhwc_qu8(
84
+ pooling_op,
85
+ input.data(), output.data());
86
+ if (status != xnn_status_success) {
87
+ state.SkipWithError("failed to setup Average Pooling operator");
88
+ return;
89
+ }
90
+
91
+ for (auto _ : state) {
92
+ status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
93
+ if (status != xnn_status_success) {
94
+ state.SkipWithError("failed to run Average Pooling operator");
95
+ return;
96
+ }
97
+ }
98
+
99
+ status = xnn_delete_operator(pooling_op);
100
+ if (status != xnn_status_success) {
101
+ state.SkipWithError("failed to delete Average Pooling operator");
102
+ return;
103
+ }
104
+ pooling_op = nullptr;
105
+
106
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
107
+ if (cpu_frequency != 0) {
108
+ state.counters["cpufreq"] = cpu_frequency;
109
+ }
110
+
111
+ state.counters["bytes"] = benchmark::Counter(
112
+ uint64_t(state.iterations()) *
113
+ batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(uint8_t),
114
+ benchmark::Counter::kIsRate);
115
+ }
116
+
117
+ static void xnnpack_average_pooling_f32(benchmark::State& state, const char* net) {
118
+ const size_t batch_size = state.range(0);
119
+ const size_t input_height = state.range(1);
120
+ const size_t input_width = state.range(2);
121
+ const size_t pooling_size = state.range(3);
122
+ const size_t padding_size = state.range(4);
123
+ const size_t stride = state.range(5);
124
+ const size_t channels = state.range(6);
125
+
126
+ std::random_device random_device;
127
+ auto rng = std::mt19937(random_device());
128
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
129
+
130
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
131
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
132
+
133
+ std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
134
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
135
+ std::vector<float> output(batch_size * output_height * output_width * channels);
136
+ std::fill(output.begin(), output.end(), std::nanf(""));
137
+
138
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
139
+ if (status != xnn_status_success) {
140
+ state.SkipWithError("failed to initialize XNNPACK");
141
+ return;
142
+ }
143
+
144
+ xnn_operator_t pooling_op = nullptr;
145
+ status = xnn_create_average_pooling2d_nhwc_f32(
146
+ padding_size, padding_size, padding_size, padding_size,
147
+ pooling_size, pooling_size,
148
+ stride, stride,
149
+ channels, channels /* input pixel stride */, channels /* output pixel stride */,
150
+ -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
151
+ 0 /* flags */, &pooling_op);
152
+ if (status != xnn_status_success) {
153
+ state.SkipWithError("failed to create Average Pooling operator");
154
+ return;
155
+ }
156
+
157
+ status = xnn_reshape_average_pooling2d_nhwc_f32(
158
+ pooling_op,
159
+ batch_size, input_height, input_width,
160
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
161
+ nullptr /* thread pool */);
162
+ if (status != xnn_status_success) {
163
+ state.SkipWithError("failed to reshape Average Pooling operator");
164
+ return;
165
+ }
166
+
167
+ status = xnn_setup_average_pooling2d_nhwc_f32(
168
+ pooling_op,
169
+ input.data(), output.data());
170
+ if (status != xnn_status_success) {
171
+ state.SkipWithError("failed to setup Average Pooling operator");
172
+ return;
173
+ }
174
+
175
+ for (auto _ : state) {
176
+ status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
177
+ if (status != xnn_status_success) {
178
+ state.SkipWithError("failed to run Average Pooling operator");
179
+ return;
180
+ }
181
+ }
182
+
183
+ status = xnn_delete_operator(pooling_op);
184
+ if (status != xnn_status_success) {
185
+ state.SkipWithError("failed to delete Average Pooling operator");
186
+ return;
187
+ }
188
+ pooling_op = nullptr;
189
+
190
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
191
+ if (cpu_frequency != 0) {
192
+ state.counters["cpufreq"] = cpu_frequency;
193
+ }
194
+
195
+ state.counters["bytes"] = benchmark::Counter(
196
+ uint64_t(state.iterations()) *
197
+ batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
198
+ benchmark::Counter::kIsRate);
199
+ }
200
+
201
+ #ifdef BENCHMARK_TENSORFLOW_LITE
202
+ void tflite_average_pooling_f32(benchmark::State& state, const char* net) {
203
+ const size_t batch_size = state.range(0);
204
+ const size_t input_height = state.range(1);
205
+ const size_t input_width = state.range(2);
206
+ const size_t pooling_size = state.range(3);
207
+ const size_t padding_size = state.range(4);
208
+ const size_t stride = state.range(5);
209
+ const size_t channels = state.range(6);
210
+
211
+ std::random_device random_device;
212
+ auto rng = std::mt19937(random_device());
213
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
214
+
215
+ tflite::Padding padding = tflite::Padding_VALID;
216
+ if (2 * padding_size == (pooling_size - 1)) {
217
+ padding = tflite::Padding_SAME;
218
+ } else if (padding_size == 0) {
219
+ padding = tflite::Padding_VALID;
220
+ } else {
221
+ state.SkipWithError("unsupported padding");
222
+ return;
223
+ }
224
+
225
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
226
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
227
+
228
+ std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
229
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
230
+ std::vector<float> output(batch_size * output_height * output_width * channels);
231
+ std::fill(output.begin(), output.end(), std::nanf(""));
232
+
233
+ flatbuffers::FlatBufferBuilder builder;
234
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
235
+ CreateOperatorCode(builder, tflite::BuiltinOperator_AVERAGE_POOL_2D);
236
+
237
+ flatbuffers::Offset<tflite::Pool2DOptions> pool2d_options = CreatePool2DOptions(
238
+ builder, padding,
239
+ stride /* stride_w */, stride /* stride_h */,
240
+ pooling_size /* filter_width */, pooling_size /* filter_height */,
241
+ tflite::ActivationFunctionType_NONE);
242
+
243
+ flatbuffers::Offset<tflite::Buffer> buffers[1] = {
244
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
245
+ };
246
+
247
+ const int32_t input_shape[4] = {
248
+ static_cast<int32_t>(batch_size),
249
+ static_cast<int32_t>(input_height),
250
+ static_cast<int32_t>(input_width),
251
+ static_cast<int32_t>(channels)
252
+ };
253
+ const int32_t output_shape[4] = {
254
+ static_cast<int32_t>(batch_size),
255
+ static_cast<int32_t>(output_height),
256
+ static_cast<int32_t>(output_width),
257
+ static_cast<int32_t>(channels)
258
+ };
259
+
260
+ flatbuffers::Offset<tflite::Tensor> tensors[2] = {
261
+ tflite::CreateTensor(builder,
262
+ builder.CreateVector<int32_t>(input_shape, 4),
263
+ tflite::TensorType_FLOAT32),
264
+ tflite::CreateTensor(builder,
265
+ builder.CreateVector<int32_t>(output_shape, 4),
266
+ tflite::TensorType_FLOAT32),
267
+ };
268
+
269
+ const int32_t op_inputs[1] = { 0 };
270
+ const int32_t op_outputs[1] = { 1 };
271
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
272
+ builder,
273
+ 0 /* opcode_index */,
274
+ builder.CreateVector<int32_t>(op_inputs, 1),
275
+ builder.CreateVector<int32_t>(op_outputs, 1),
276
+ tflite::BuiltinOptions_Pool2DOptions,
277
+ pool2d_options.Union());
278
+
279
+ const int32_t graph_inputs[1] = { 0 };
280
+ const int32_t graph_outputs[1] = { 1 };
281
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
282
+ builder,
283
+ builder.CreateVector(tensors, 2),
284
+ builder.CreateVector<int32_t>(graph_inputs, 1),
285
+ builder.CreateVector<int32_t>(graph_outputs, 1),
286
+ builder.CreateVector(&op, 1));
287
+
288
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
289
+ TFLITE_SCHEMA_VERSION,
290
+ builder.CreateVector(&operator_code, 1),
291
+ builder.CreateVector(&subgraph, 1),
292
+ builder.CreateString("AVERAGE_POOL_2D model"),
293
+ builder.CreateVector(buffers, 1));
294
+
295
+ builder.Finish(model_buffer);
296
+
297
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
298
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
299
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
300
+ std::unique_ptr<tflite::Interpreter> interpreter;
301
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
302
+ state.SkipWithError("failed to create TFLite interpreter");
303
+ return;
304
+ }
305
+ if (interpreter == nullptr) {
306
+ state.SkipWithError("TFLite interpreter is null");
307
+ return;
308
+ }
309
+ interpreter->SetNumThreads(1);
310
+
311
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
312
+ state.SkipWithError("failed to allocate tensors");
313
+ return;
314
+ }
315
+
316
+ std::generate(
317
+ interpreter->typed_tensor<float>(0),
318
+ interpreter->typed_tensor<float>(0) + batch_size * input_height * input_width * channels,
319
+ std::ref(f32rng));
320
+
321
+ for (auto _ : state) {
322
+ if (interpreter->Invoke() != kTfLiteOk) {
323
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["bytes"] = benchmark::Counter(
+     uint64_t(state.iterations()) *
+       batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
+     benchmark::Counter::kIsRate);
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ // Final global average pooling in ImageNet classification models.
+ static void ImageNet(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W   K  P  S  C    */
+   b->Args({1, 13, 13, 13, 0, 1, 1000});
+   b->Args({1,  7,  7,  7, 0, 1, 1000});
+ }
+
+ // ShuffleNet v1 with 1 group.
+ static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 144});
+   b->Args({1, 14, 14, 3, 1, 2, 288});
+   b->Args({1,  7,  7, 3, 1, 2, 576});
+ }
+
+ // ShuffleNet v1 with 2 groups.
+ static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 200});
+   b->Args({1, 14, 14, 3, 1, 2, 400});
+   b->Args({1,  7,  7, 3, 1, 2, 800});
+ }
+
+ // ShuffleNet v1 with 3 groups.
+ static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 240});
+   b->Args({1, 14, 14, 3, 1, 2, 480});
+   b->Args({1,  7,  7, 3, 1, 2, 960});
+ }
+
+ // ShuffleNet v1 with 4 groups.
+ static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C    */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 272});
+   b->Args({1, 14, 14, 3, 1, 2, 576});
+   b->Args({1,  7,  7, 3, 1, 2, 1088});
+ }
+
+ // ShuffleNet v1 with 8 groups.
+ static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C    */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 384});
+   b->Args({1, 14, 14, 3, 1, 2, 768});
+   b->Args({1,  7,  7, 3, 1, 2, 1536});
+ }
+
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
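For readers cross-checking the "bytes" counter above: with padding P = 0 and stride S = 1, a K x K pooling window over an H x W input produces an (H - K + 1) x (W - K + 1) output, so both ImageNet cases reduce to a 1x1 global average. A minimal standalone sketch of the arithmetic (the variable names are illustrative, not part of the benchmark):

    #include <cstddef>
    #include <cstdio>

    int main() {
      // ImageNet global-average-pooling case from the table above:
      // N=1, H=W=13, K=13, P=0, S=1, C=1000, float elements.
      const size_t N = 1, H = 13, W = 13, K = 13, P = 0, S = 1, C = 1000;
      const size_t OH = (H + 2 * P - K) / S + 1;  // = 1
      const size_t OW = (W + 2 * P - K) / S + 1;  // = 1
      const size_t bytes = N * (H * W + OH * OW) * C * sizeof(float);
      std::printf("output %zux%zu, %zu bytes per iteration\n", OH, OW, bytes);  // 1x1, 680000
      return 0;
    }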
bench/bankers-rounding.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2020 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_bankers_rounding_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t bankers_rounding_op = nullptr;
+   status = xnn_create_bankers_rounding_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &bankers_rounding_op);
+   if (status != xnn_status_success || bankers_rounding_op == nullptr) {
+     state.SkipWithError("failed to create Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_reshape_bankers_rounding_nc_f16(bankers_rounding_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_setup_bankers_rounding_nc_f16(bankers_rounding_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Bankers' Rounding operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(bankers_rounding_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Bankers' Rounding operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(bankers_rounding_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Bankers' Rounding operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_bankers_rounding_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t bankers_rounding_op = nullptr;
+   status = xnn_create_bankers_rounding_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &bankers_rounding_op);
+   if (status != xnn_status_success || bankers_rounding_op == nullptr) {
+     state.SkipWithError("failed to create Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_reshape_bankers_rounding_nc_f32(bankers_rounding_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_setup_bankers_rounding_nc_f32(bankers_rounding_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Bankers' Rounding operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(bankers_rounding_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Bankers' Rounding operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(bankers_rounding_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Bankers' Rounding operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_bankers_rounding_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_ROUND);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Round model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_bankers_rounding_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_bankers_rounding_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_bankers_rounding_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
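As a reminder of what the operator benchmarked above computes: bankers' rounding is round-half-to-even (the IEEE 754 default), so ties go to the even neighbor rather than away from zero; TFLite's ROUND op is documented to round half to even as well, which is what makes the two benchmarks comparable. A standalone illustration using only the C++ standard library, not XNNPACK:

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // round-to-nearest, ties-to-even
      // std::nearbyint() rounds according to the current rounding mode.
      std::printf("0.5 -> %.0f\n", std::nearbyint(0.5));  // 0
      std::printf("1.5 -> %.0f\n", std::nearbyint(1.5));  // 2
      std::printf("2.5 -> %.0f\n", std::nearbyint(2.5));  // 2
      std::printf("3.5 -> %.0f\n", std::nearbyint(3.5));  // 4
      return 0;
    }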
bench/batch-matrix-multiply.cc ADDED
@@ -0,0 +1,259 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cstddef>
+ #include <cstdint>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <utility>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ void xnnpack_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
+   const size_t batch_size = state.range(0);
+   const size_t m = state.range(1);
+   const size_t n = state.range(2);  // arguments are ordered B, M, N, K (see bench/bgemm.h)
+   const size_t k = state.range(3);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+
+   std::vector<float> input1(batch_size * m * k);
+   std::generate(input1.begin(), input1.end(), std::ref(f32rng));
+   std::vector<float> input2(batch_size * k * n);
+   std::generate(input2.begin(), input2.end(), std::ref(f32rng));
+   const size_t output_elements = batch_size * m * n;
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_buffers =
+     1 + benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), sizeof(float) * output_elements);
+   std::vector<float> output(output_elements * num_buffers);
+
+   std::vector<xnn_operator_t> ops(num_buffers);
+
+   for (xnn_operator_t& op : ops) {
+     status = xnn_create_batch_matrix_multiply_nc_f32(/*flags=*/0, &op);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to create FP32 BatchMatMul operator");
+       return;
+     }
+   }
+
+   std::vector<std::unique_ptr<std::vector<char>>> workspaces;
+
+   for (xnn_operator_t& op : ops) {
+     size_t workspace_size = 0;
+     size_t workspace_alignment = 0;
+     status =
+       xnn_reshape_batch_matrix_multiply_nc_f32(op, batch_size, m, k, n, &workspace_size, &workspace_alignment, nullptr);
+
+     auto workspace = std::make_unique<std::vector<char>>(workspace_size);
+     char* workspace_ptr = workspace->data();
+
+     workspaces.push_back(std::move(workspace));
+
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to reshape FP32 BatchMatMul operator");
+       return;
+     }
+
+     status = xnn_setup_batch_matrix_multiply_nc_f32(op, workspace_ptr, input1.data(), input2.data(), output.data());
+   }
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     status = xnn_run_operator(ops[buffer_index], /*threadpool=*/nullptr);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run FP32 BatchMatMul operator");
+       return;
+     }
+   }
+
+   for (xnn_operator_t& op : ops) {
+     status = xnn_delete_operator(op);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to delete FP32 BatchMatMul operator");
+       return;
+     }
+     op = nullptr;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * batch_size * m * k * n,
+     benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ void tflite_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
+   const size_t batch_size = state.range(0);
+   const size_t m = state.range(1);
+   const size_t n = state.range(2);  // arguments are ordered B, M, N, K (see bench/bgemm.h)
+   const size_t k = state.range(3);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+
+   std::vector<float> input1(batch_size * m * k);
+   std::generate(input1.begin(), input1.end(), std::ref(f32rng));
+   std::vector<float> input2(batch_size * k * n);
+   std::generate(input2.begin(), input2.end(), std::ref(f32rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_BATCH_MATMUL, 0);
+
+   flatbuffers::Offset<tflite::BatchMatMulOptions> batch_mat_mul_options =
+     tflite::CreateBatchMatMulOptions(builder, false, false, false);
+
+   flatbuffers::Offset<tflite::Buffer> buffers[1] = {
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   };
+
+   const int32_t input1_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(m),
+     static_cast<int32_t>(k),
+   };
+   const int32_t input2_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(k),
+     static_cast<int32_t>(n),
+   };
+   const int32_t output_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(m),
+     static_cast<int32_t>(n),
+   };
+
+   flatbuffers::Offset<tflite::Tensor> tensors[3] = {
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(input1_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("input1")),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(input2_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("input2")),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(output_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("output")),
+   };
+
+   const int32_t op_inputs[2] = { 0, 1 };
+   const int32_t op_outputs[1] = { 2 };
+   flatbuffers::Offset<tflite::Operator> op = CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs, 2),
+     builder.CreateVector<int32_t>(op_outputs, 1),
+     tflite::BuiltinOptions_BatchMatMulOptions,
+     batch_mat_mul_options.Union());
+
+   const int32_t graph_inputs[2] = { 0, 1 };
+   const int32_t graph_outputs[1] = { 2 };
+   flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors, 3),
+     builder.CreateVector<int32_t>(graph_inputs, 2),
+     builder.CreateVector<int32_t>(graph_outputs, 1),
+     builder.CreateVector(&op, 1),
+     builder.CreateString("BatchMatMul subgraph"));
+
+   flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("BatchMatMul model");
+
+   flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     description,
+     builder.CreateVector(buffers, 1));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   if (interpreter == nullptr) {
+     state.SkipWithError("TFLite interpreter is null");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size * m * k,
+     std::ref(f32rng));
+
+   std::generate(
+     interpreter->typed_tensor<float>(1),
+     interpreter->typed_tensor<float>(1) + batch_size * k * n,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * batch_size * m * k * n,
+     benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
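One detail to keep in mind when comparing numbers across files: the "FLOPS" counter above counts one fused multiply-add per (m, k, n) triple, i.e. batch_size * m * k * n per iteration, while the bf16 GEMM benchmark below multiplies by 2 to count the multiply and the add separately. A tiny sketch of the difference, using the "Albert" shape from bench/bgemm.h for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // "Albert" shape from bench/bgemm.h: B=12, M=384, N=64, K=384.
      const uint64_t B = 12, M = 384, N = 64, K = 384;
      const uint64_t macs  = B * M * N * K;  // what batch-matrix-multiply reports
      const uint64_t flops = 2 * macs;       // multiply + add, as bf16-gemm reports
      std::printf("%llu MACs, %llu FLOPs per iteration\n",
                  (unsigned long long) macs, (unsigned long long) flops);
      return 0;
    }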
bench/bf16-gemm.cc ADDED
@@ -0,0 +1,244 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/gemm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/gemm.h>
+ #include <xnnpack/math.h>
+ #include <xnnpack/pack.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ static void bf16_gemm(benchmark::State& state,
+   xnn_bf16_gemm_minmax_ukernel_fn gemm,
+   size_t mr, size_t nr, size_t kr, size_t sr,
+   xnn_init_bf16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+
+   std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+   std::vector<uint16_t> k(nc * kc);
+   std::generate(k.begin(), k.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+   std::vector<uint16_t> b(nc);
+   std::generate(b.begin(), b.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+
+   const size_t w_elements = nc_stride * kc_stride + nc_stride;
+   const size_t c_elements = mc * nc;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7FC0) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_bf16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFF80) /* -inf */, UINT16_C(0x7F80) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
+     // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
+     // - W is not in cache (for any cache level)
+     // - C is not in cache (for any cache level)
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < mc; m += mr) {
+       const uint32_t mb = min(mc - m, mr);
+       for (uint32_t n = 0; n < nc; n += nr) {
+         const uint32_t nb = min(nc - n, nr);
+         gemm(
+           mb, nb, kc * sizeof(uint16_t),
+           a.data() + m * kc, kc * sizeof(uint16_t),
+           w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
+           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
+           &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 1, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 4, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 5, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 6, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ static void bf16_gemm_1x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_2x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_3x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ static void bf16_gemm_1x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_2x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_3x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ BENCHMARK_GEMM(bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfdot)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfmlal)
+ #endif  // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ static void bf16_gemm_1x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_2x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_3x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_4x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_5x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+
+ static void bf16_gemm_1x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_2x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_3x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_4x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_5x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_zip)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_shland)
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
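The fp32_to_bits(...) >> 16 pattern above builds bfloat16 test data by truncation: bf16 is the upper half of an IEEE fp32 value (same sign and 8-bit exponent, 7-bit mantissa), so converting back to fp32 is exact. A standalone sketch of that round trip without the XNNPACK helpers (bf16_from_f32 and f32_from_bf16 are illustrative names):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Truncating fp32 -> bf16: keep the high 16 bits, drop the low mantissa bits.
    static uint16_t bf16_from_f32(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      return (uint16_t) (bits >> 16);
    }

    // bf16 -> fp32 is exact: place the 16 bits in the high half.
    static float f32_from_bf16(uint16_t h) {
      const uint32_t bits = (uint32_t) h << 16;
      float x;
      std::memcpy(&x, &bits, sizeof(x));
      return x;
    }

    int main() {
      const float x = 1.2345f;
      const uint16_t h = bf16_from_f32(x);
      std::printf("%.6f -> 0x%04X -> %.6f\n", x, h, f32_from_bf16(h));
      return 0;
    }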
bench/bgemm.h ADDED
@@ -0,0 +1,70 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+ #define BENCHMARK_BGEMM(bgemm_fn) \
+   BENCHMARK_CAPTURE(bgemm_fn, albert, "Albert")->Apply(AlbertBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, mobilebert, "MobileBert")->Apply(MobilebertBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_diffusion, "SD1.X Diffusion")->Apply(SD1XDiffusionBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_encoder_decoder, "SD1.X Encoder-Decoder")->Apply(SD1XEncoderDecoderBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_text_encoder, "SD1.X Text Encoder")->Apply(SD1XTextEncoderBgemmArguments)->UseRealTime();
+
+
+ static void AlbertBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*        B    M    N    K  */
+   b->Args({12, 384,  64, 384});
+   b->Args({12, 384, 384,  64});
+ }
+
+ static void MobilebertBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B    M    N    K  */
+   b->Args({4, 384,  32, 384});
+   b->Args({4, 384, 384,  32});
+ }
+
+ static void SD1XDiffusionBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B     M     N     K   */
+   b->Args({8, 4096, 4096,   40});
+   b->Args({8, 4096,   40, 4096});
+   b->Args({8, 4096,   77,   40});
+   b->Args({8, 4096,   40,   77});
+   b->Args({8, 1024, 1024,   80});
+   b->Args({8, 1024,   80, 1024});
+   b->Args({8, 1024,   77,   80});
+   b->Args({8, 1024,   80,   77});
+   b->Args({8,  256,  256,  160});
+   b->Args({8,  256,  160,  256});
+   b->Args({8,  256,   77,  160});
+   b->Args({8,  256,  160,   77});
+   b->Args({8,   64,   64,  160});
+   b->Args({8,   64,  160,   64});
+   b->Args({8,   64,   77,  160});
+   b->Args({8,   64,  160,   77});
+ }
+
+ static void SD1XEncoderDecoderBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B     M     N     K   */
+   b->Args({1, 4096, 4096,  512});
+   b->Args({1,  512, 4096, 4096});
+ }
+
+ static void SD1XTextEncoderBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*        B   M   N   K  */
+   b->Args({12, 77, 77, 64});
+   b->Args({12, 77, 64, 77});
+ }
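For context on how this header is consumed: BENCHMARK_BGEMM expects a function with the two-argument signature used by BENCHMARK_CAPTURE, reading its shape as B, M, N, K from state.range(0..3). A minimal hypothetical benchmark body (my_bgemm is an illustrative name, not part of the tree):

    #include <benchmark/benchmark.h>
    #include "bench/bgemm.h"

    static void my_bgemm(benchmark::State& state, const char* net) {
      const size_t b = state.range(0);
      const size_t m = state.range(1);
      const size_t n = state.range(2);
      const size_t k = state.range(3);
      for (auto _ : state) {
        // A real benchmark would run a batched matrix multiplication here.
        benchmark::DoNotOptimize(b * m * n * k);
      }
    }

    BENCHMARK_BGEMM(my_bgemm)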
bench/ceiling.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2020 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_ceiling_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t ceiling_op = nullptr;
+   status = xnn_create_ceiling_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &ceiling_op);
+   if (status != xnn_status_success || ceiling_op == nullptr) {
+     state.SkipWithError("failed to create Ceiling operator");
+     return;
+   }
+
+   status = xnn_reshape_ceiling_nc_f16(ceiling_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Ceiling operator");
+     return;
+   }
+
+   status = xnn_setup_ceiling_nc_f16(ceiling_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Ceiling operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Ceiling operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(ceiling_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Ceiling operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_ceiling_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t ceiling_op = nullptr;
+   status = xnn_create_ceiling_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &ceiling_op);
+   if (status != xnn_status_success || ceiling_op == nullptr) {
+     state.SkipWithError("failed to create Ceiling operator");
+     return;
+   }
+
+   status = xnn_reshape_ceiling_nc_f32(ceiling_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Ceiling operator");
+     return;
+   }
+
+   status = xnn_setup_ceiling_nc_f32(ceiling_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Ceiling operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Ceiling operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(ceiling_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Ceiling operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_ceiling_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_CEIL);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Ceil model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_ceiling_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_ceiling_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_ceiling_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
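A pattern worth calling out, since it recurs in every elementwise benchmark above: the input buffer is over-allocated by XNN_EXTRA_BYTES / sizeof(element), while the output is sized exactly. My understanding is that this slack exists because XNNPACK microkernels are allowed to load (but not consume) a few bytes past the last input element; the sketch below condenses the allocation pattern under that assumption:

    #include <cstddef>
    #include <vector>

    #include <xnnpack.h>  // defines XNN_EXTRA_BYTES

    // Inputs get XNNPACK's read-ahead slack; outputs are sized exactly.
    std::vector<float> make_padded_input(size_t batch_size) {
      return std::vector<float>(batch_size + XNN_EXTRA_BYTES / sizeof(float));
    }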
bench/channel-shuffle.cc ADDED
@@ -0,0 +1,340 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <random>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+
+
+ static void channel_shuffle_x8(benchmark::State& state, const char* net) {
+   const size_t batch_size = static_cast<size_t>(state.range(0));
+   const size_t groups = static_cast<size_t>(state.range(1));
+   const size_t group_channels = static_cast<size_t>(state.range(2));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
+
+   std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels);
+   std::vector<uint8_t> output(batch_size * groups * group_channels);
+   std::generate(input.begin(), input.end(), std::ref(u8rng));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t channel_shuffle_op = nullptr;
+   status = xnn_create_channel_shuffle_nc_x8(
+     groups, group_channels,
+     groups * group_channels /* input stride */,
+     groups * group_channels /* output stride */,
+     0 /* flags */, &channel_shuffle_op);
+   if (status != xnn_status_success || channel_shuffle_op == nullptr) {
+     state.SkipWithError("failed to create X8 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_reshape_channel_shuffle_nc_x8(
+     channel_shuffle_op,
+     batch_size,
+     nullptr /* thread pool */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape X8 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_setup_channel_shuffle_nc_x8(
+     channel_shuffle_op,
+     input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup X8 Channel Shuffle operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(channel_shuffle_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run X8 Channel Shuffle operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(channel_shuffle_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete X8 Channel Shuffle operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = batch_size * groups * group_channels;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void channel_shuffle_x32(benchmark::State& state, const char* net) {
+   const size_t batch_size = static_cast<size_t>(state.range(0));
+   const size_t groups = static_cast<size_t>(state.range(1));
+   const size_t group_channels = static_cast<size_t>(state.range(2));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+
+   std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + batch_size * groups * group_channels);
+   std::vector<float> output(batch_size * groups * group_channels);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t channel_shuffle_op = nullptr;
+   status = xnn_create_channel_shuffle_nc_x32(
+     groups, group_channels,
+     groups * group_channels /* input stride */,
+     groups * group_channels /* output stride */,
+     0 /* flags */, &channel_shuffle_op);
+   if (status != xnn_status_success || channel_shuffle_op == nullptr) {
+     state.SkipWithError("failed to create X32 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_reshape_channel_shuffle_nc_x32(
+     channel_shuffle_op,
+     batch_size,
+     nullptr /* thread pool */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape X32 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_setup_channel_shuffle_nc_x32(
+     channel_shuffle_op,
+     input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup X32 Channel Shuffle operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(channel_shuffle_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run X32 Channel Shuffle operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(channel_shuffle_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete X32 Channel Shuffle operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = batch_size * groups * group_channels;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
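Before the argument tables below, a plain-C++ reference for what the operator computes, assuming the usual ShuffleNet definition: each row of groups * group_channels channels is read as a (groups x group_channels) matrix and written back transposed. This loop is illustrative only, not the XNNPACK implementation:

    #include <cstddef>

    // Reference channel shuffle for NC layout: out[n][k*G + g] = in[n][g*GC + k].
    void channel_shuffle_ref(const float* in, float* out,
                             size_t batch_size, size_t groups, size_t group_channels) {
      const size_t channels = groups * group_channels;
      for (size_t n = 0; n < batch_size; n++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t k = 0; k < group_channels; k++) {
            out[n * channels + k * groups + g] = in[n * channels + g * group_channels + k];
          }
        }
      }
    }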
+ static void ShuffleNetV1G2Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({56 * 56, 2, 25});
+ b->Args({28 * 28, 2, 25});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 50});
+ b->Args({14 * 14, 2, 50});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 100});
+ b->Args({ 7 * 7, 2, 100});
+ }
+
+ static void ShuffleNetV1G3Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 3, 20});
+ b->Args({28 * 28, 3, 20});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 3, 40});
+ b->Args({14 * 14, 3, 40});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 3, 80});
+ b->Args({ 7 * 7, 3, 80});
+ }
+
+ static void ShuffleNetV1G4Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 4, 17});
+ b->Args({28 * 28, 4, 17});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 4, 34});
+ b->Args({14 * 14, 4, 34});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 4, 68});
+ b->Args({ 7 * 7, 4, 68});
+ }
+
+ static void ShuffleNetV1G8Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 8, 12});
+ b->Args({28 * 28, 8, 12});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 8, 24});
+ b->Args({14 * 14, 8, 24});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 8, 48});
+ b->Args({ 7 * 7, 8, 48});
+ }
+
+ static void ShuffleNetV2x0_5Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 24});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 48});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 96});
+ }
+
+ static void ShuffleNetV2x1_0Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 58});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 116});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 232});
+ }
+
+ static void ShuffleNetV2x1_5Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 88});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 176});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 352});
+ }
+
+ static void ShuffleNetV2x2_0Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 122});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 244});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 488});
+ }
+
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
+
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
+
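Each Args row folds the spatial extent into the batch dimension (N = H * W), since channel shuffle acts independently at every spatial position; the H/W comments above each row record the original feature-map size. Registering an extra shape follows the same pattern; a hypothetical sketch (the `custom` name and the 32x32 / 4-group / 16-channel shape are made up for illustration):

// Hypothetical extra instance: one custom shape with 4 groups of 16
// channels over a 32x32 feature map, reusing channel_shuffle_x32 above.
static void CustomShuffleArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "G", "GC"});
  b->Args({32 * 32, 4, 16});
}
BENCHMARK_CAPTURE(channel_shuffle_x32, custom, "custom")->Apply(CustomShuffleArguments)->UseRealTime();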
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/conv.h ADDED
@@ -0,0 +1,852 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+
+ #define BENCHMARK_CONV(conv_fn) \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, inception_v3, "Inception v3")->Apply(InceptionV3ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, resnet18, "ResNet-18")->Apply(ResNet18ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, resnet50, "ResNet-50")->Apply(ResNet50ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, vgg, "VGG")->Apply(VGGConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955ConvArguments)->UseRealTime();
+
+
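A benchmark translation unit consumes this header by defining a function with the (benchmark::State&, const char*) signature expected by BENCHMARK_CAPTURE and passing it to BENCHMARK_CONV; a minimal hypothetical sketch (the noop body stands in for a real convolution benchmark):

// Hypothetical consumer of this header: BENCHMARK_CONV instantiates one
// Google Benchmark per network using the argument generators defined below.
#include "bench/conv.h"

static void noop_conv(benchmark::State& state, const char* net) {
  (void) net;  // the network name is only used to label the benchmark
  for (auto _ : state) {
    // A real benchmark would run a convolution operator here.
  }
}
BENCHMARK_CONV(noop_conv)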
+ // ShuffleNet v1 with 1 group.
+ static void ShuffleNetV1G1ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 36});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 120});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 36});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 144});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 72});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 144});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 72});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 288});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 144});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 288});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 144});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 576});
+ }
+
+ // ShuffleNet v1 with 2 groups.
+ static void ShuffleNetV1G2ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 50});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 88});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 25});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 100});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 50});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 100});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 50});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 200});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 100});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 200});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 400, 100});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 400});
+ }
+
+ // ShuffleNet v1 with 3 groups.
+ static void ShuffleNetV1G3ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 60});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 72});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 20});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 80});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 40});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 80});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 40});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 160});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 80});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 160});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 80});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 320});
+ }
+
+ // ShuffleNet v1 with 4 groups.
+ static void ShuffleNetV1G4ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 68});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 62});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 17});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 68});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 34});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 68});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 34});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 136});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 68});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 136});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 272, 68});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 272});
+ }
+
+ // ShuffleNet v1 with 8 groups.
+ static void ShuffleNetV1G8ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 45});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 12});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 48});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 24});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 48});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 96});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 48});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 96});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 48});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 192});
+ }
+
+ // ShuffleNet v2 (0.5X scale).
+ static void ShuffleNetV2X05ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 24});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 24});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 48});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 48});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 96});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 1024});
+ }
+
+ // ShuffleNet v2 (1.0X scale).
+ static void ShuffleNetV2X10ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 58});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 58});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 58, 58});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 116, 116});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 116, 116});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 232, 232});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 232, 232});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 464, 1024});
+ }
+
+ // ShuffleNet v2 (1.5X scale).
+ static void ShuffleNetV2X15ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 88});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 176, 176});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 176, 176});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 352, 352});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 352, 352});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 704, 1024});
+ }
+
+ // ShuffleNet v2 (2.0X scale).
+ static void ShuffleNetV2X20ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 122});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 122});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 122, 122});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 244, 244});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 244, 244});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 488, 488});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 488, 488});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 976, 2048});
+ }
+
+ static void MobileNetV1ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 128});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 128, 128});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 256});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 256, 256});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 512});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 1024});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 1024, 1024});
+ }
+
+ static void MobileNetV2ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
+
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 16});
+
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 96});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 144, 24});
+
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 32});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
+
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 192, 64});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
+
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 160});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 320});
+
+ /**************** Pre-pooling Conv2D *****************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 1280});
+ /**************** Post-pooling Conv2D ****************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1000});
+ }
+
+ static void MobileNetV3SmallConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /******************* Initial Stage *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 16, 8});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 8, 16});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 16});
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 72});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 24});
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 24});
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 40});
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 120, 48});
+ /******************** Bottleneck 8 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 144});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 40});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 40, 144});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 144, 48});
+ /******************** Bottleneck 9 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 288});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 288, 72});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 288});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 288, 96});
+ /******************* Bottleneck 10 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
+ /******************* Bottleneck 11 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
+ /********************* Last Stage ********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 1024});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1024, 1001});
+ }
+
+ static void MobileNetV3LargeConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /******************* Initial Stage *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 16});
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 24});
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 72, 24});
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 24});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 40});
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 240});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 80});
+ /******************** Bottleneck 8 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 200});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 80});
+ /******************** Bottleneck 9 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
+ /******************* Bottleneck 10 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
+ /******************* Bottleneck 11 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 480});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 480, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 480});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 480, 112});
+ /******************* Bottleneck 12 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 672, 168});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 168, 672});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 672, 112});
+ /******************* Bottleneck 13 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 672, 160});
+ /******************* Bottleneck 14 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ /******************* Bottleneck 15 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ /******************** Last Stage *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 1280});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1001});
+ }
+
+ // SqueezeNet 1.0
+ static void SqueezeNetV10ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 96});
+ /*********************** Fire 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 96, 16});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
+ //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 32});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 6 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 7 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 48});
+ //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
+ //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 8 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 64});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 64, 256});
+ /*********************** Fire 9 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /********************** Conv 10 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 64});
+ /*********************** Fire 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 64, 16});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
+ //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 128, 32});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
+ //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 6 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 7 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 48});
+ //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
+ //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 8 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 64});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /*********************** Fire 9 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
+ //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /********************** Conv 10 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
+ }
+
+ static void InceptionV3ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({299, 299, 3, 3, 0, 0, 2, 1, 3, 32});
+ b->Args({149, 149, 3, 3, 0, 0, 1, 1, 32, 32});
+ b->Args({147, 147, 3, 3, 2, 2, 1, 1, 32, 64});
+ b->Args({ 73, 73, 1, 1, 0, 0, 1, 1, 64, 80});
+ b->Args({ 73, 73, 3, 3, 0, 0, 1, 1, 80, 192});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 48});
+ b->Args({ 35, 35, 5, 5, 4, 4, 1, 1, 48, 64});
+ b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 64, 96});
+ b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 96, 96});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 32});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 48});
+ b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 288, 384});
+ b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 96, 96});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 192});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 128});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 128});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 128});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 192});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 160});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 160});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 160});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 192});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 192, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 192, 192});
+ b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 320});
+ b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 192});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 320});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 384});
+ b->Args({ 8, 8, 1, 3, 0, 2, 1, 1, 384, 384});
+ b->Args({ 8, 8, 3, 1, 2, 0, 1, 1, 384, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 448});
+ b->Args({ 8, 8, 3, 3, 2, 2, 1, 1, 448, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 192});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 320});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 448});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 192});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 2048, 1001});
+ }
+
+ static void ResNet18ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1 ***********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
+ /********************* Conv 2.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ /********************* Conv 3.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 64, 128});
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
+ b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 64, 128});
+ /********************* Conv 4.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 128, 256});
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
+ b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 128, 256});
+ /********************* Conv 5.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 256, 512});
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
+ b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 256, 512});
+ }
+
+ static void ResNet50ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1 ***********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
+ /********************* Conv 2.1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 64});
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ /********************* Conv 2.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 64});
+ //b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ /********************** Conv 3.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 128});
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 128, 128});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
+ b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 256, 512});
+ /********************** Conv 3.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 128});
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
+ /********************** Conv 4.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 256});
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 256, 256});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
+ b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 512, 1024});
+ /********************** Conv 4.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 256});
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
+ /********************** Conv 5.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 512});
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 512, 512});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
+ b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 1024, 2048});
+ /********************** Conv 5.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 2048, 512});
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
+ }
+
+ static void VGGConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 1, 1, 3, 64});
+ /********************** Conv 1.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 1, 1, 64, 64});
+
+ /********************** Conv 2.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 64, 128});
+ /********************** Conv 2.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 128, 128});
+
+ /********************** Conv 3.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 128, 256});
+ /********************** Conv 3.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 256, 256});
+ /********************** Conv 3.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 256});
+
+ /********************** Conv 4.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 256, 512});
+ /********************** Conv 4.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 512, 512});
+ /********************** Conv 4.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 512});
+
+ /********************** Conv 5.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 512, 512});
+ /********************** Conv 5.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
+ }
+
+ // SRCNN (9-1-5)
+ static void SRCNN915ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 1, 1, 0, 0, 1, 1, 64, 32});
+ b->Args({376, 376, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-3-5)
+ static void SRCNN935ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 3, 3, 0, 0, 1, 1, 64, 32});
+ b->Args({374, 374, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-5-5)
+ static void SRCNN955ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 5, 5, 0, 0, 1, 1, 64, 32});
+ b->Args({372, 372, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
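For interpreting these rows, the output spatial size follows the usual convolution arithmetic, where PH/PW appear to be total (not per-side) padding, e.g. the 224x224 / 3x3 / PH=2 / S=1 VGG rows keep a 224x224 output. A small hypothetical helper (names made up for illustration) deriving the output size and per-image multiply-accumulate count for a single-group row:

// Hypothetical helper: derive output dims and MACs from one Args row
// {H, W, KH, KW, PH, PW, S, D, GCin, GCout}, assuming a single group.
#include <cstddef>

struct ConvShape {
  size_t output_height, output_width, macs;
};

static ConvShape conv_shape(
    size_t h, size_t w, size_t kh, size_t kw, size_t ph, size_t pw,
    size_t s, size_t d, size_t gcin, size_t gcout) {
  const size_t effective_kh = (kh - 1) * d + 1;  // dilated kernel extent
  const size_t effective_kw = (kw - 1) * d + 1;
  const size_t oh = (h + ph - effective_kh) / s + 1;  // PH/PW = total padding
  const size_t ow = (w + pw - effective_kw) / s + 1;
  return ConvShape{oh, ow, oh * ow * kh * kw * gcin * gcout};
}

For example, the VGG Conv 1.1 row {224, 224, 3, 3, 2, 2, 1, 1, 3, 64} yields a 224x224 output and about 86.7M MACs per image.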
bench/convert.cc ADDED
@@ -0,0 +1,1339 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ void xnnpack_convert_f16_f32(benchmark::State& state) {
+ const size_t batch_size = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::vector<float> output(batch_size);
+ std::fill(output.begin(), output.end(), std::nanf(""));
+
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to initialize XNNPACK");
+ return;
+ }
+
+ xnn_operator_t convert_op = nullptr;
+ status = xnn_create_convert_nc_f16_f32(
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+ 0 /* flags */, &convert_op);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to create F16->F32 Convert operator");
+ return;
+ }
+
+ status = xnn_reshape_convert_nc_f16_f32(convert_op, batch_size, /*threadpool=*/nullptr);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape F16->F32 Convert operator");
+ return;
+ }
+
+ status = xnn_setup_convert_nc_f16_f32(convert_op, input.data(), output.data());
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to setup F16->F32 Convert operator");
+ return;
+ }
+
+ for (auto _ : state) {
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to run F16->F32 Convert operator");
+ return;
+ }
+ }
+
+ status = xnn_delete_operator(convert_op);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to delete F16->F32 Convert operator");
+ return;
+ }
+ convert_op = nullptr;
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
99
+ void xnnpack_convert_f32_f16(benchmark::State& state) {
100
+ const size_t batch_size = state.range(0);
101
+
102
+ std::random_device random_device;
103
+ auto rng = std::mt19937(random_device());
104
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
105
+
106
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
107
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
108
+ std::vector<uint16_t> output(batch_size);
109
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
110
+
111
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
112
+ if (status != xnn_status_success) {
113
+ state.SkipWithError("failed to initialize XNNPACK");
114
+ return;
115
+ }
116
+
117
+ xnn_operator_t convert_op = nullptr;
118
+ status = xnn_create_convert_nc_f32_f16(
119
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
120
+ 0 /* flags */, &convert_op);
121
+ if (status != xnn_status_success) {
122
+ state.SkipWithError("failed to create F32->F16 Convert operator");
123
+ return;
124
+ }
125
+
126
+ status = xnn_reshape_convert_nc_f32_f16(convert_op, batch_size, /*threadpool=*/nullptr);
127
+ if (status != xnn_status_success) {
128
+ state.SkipWithError("failed to reshape F32->F16 Convert operator");
129
+ return;
130
+ }
131
+
132
+ status = xnn_setup_convert_nc_f32_f16(convert_op, input.data(), output.data());
133
+ if (status != xnn_status_success) {
134
+ state.SkipWithError("failed to setup F32->F16 Convert operator");
135
+ return;
136
+ }
137
+
138
+ for (auto _ : state) {
139
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
140
+ if (status != xnn_status_success) {
141
+ state.SkipWithError("failed to run F32->F16 Convert operator");
142
+ return;
143
+ }
144
+ }
145
+
146
+ status = xnn_delete_operator(convert_op);
147
+ if (status != xnn_status_success) {
148
+ state.SkipWithError("failed to delete F32->F16 Convert operator");
149
+ return;
150
+ }
151
+ convert_op = nullptr;
152
+
153
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
154
+ if (cpu_frequency != 0) {
155
+ state.counters["cpufreq"] = cpu_frequency;
156
+ }
157
+
158
+ state.counters["elements"] =
159
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
160
+
161
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint16_t));
162
+ state.counters["bytes"] =
163
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
164
+ }
165
+
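+ // F32->QS8 quantization uses scale 1/128 and zero point 1, clamping to the
+ // full signed 8-bit range.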
166
+ void xnnpack_convert_f32_qs8(benchmark::State& state) {
167
+ const size_t batch_size = state.range(0);
168
+
169
+ std::random_device random_device;
170
+ auto rng = std::mt19937(random_device());
171
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
172
+
173
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
174
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
175
+ std::vector<int8_t> output(batch_size);
176
+ std::fill(output.begin(), output.end(), 0);
177
+
178
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
179
+ if (status != xnn_status_success) {
180
+ state.SkipWithError("failed to initialize XNNPACK");
181
+ return;
182
+ }
183
+
184
+ xnn_operator_t convert_op = nullptr;
185
+ status = xnn_create_convert_nc_f32_qs8(
186
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
187
+ 1.0f / 128.0f /* scale */, 1 /* zero point */,
188
+ std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
189
+ 0 /* flags */, &convert_op);
190
+ if (status != xnn_status_success) {
191
+ state.SkipWithError("failed to create F32->QS8 Convert operator");
192
+ return;
193
+ }
194
+
195
+ status = xnn_reshape_convert_nc_f32_qs8(convert_op, batch_size, /*threadpool=*/nullptr);
196
+ if (status != xnn_status_success) {
197
+ state.SkipWithError("failed to reshape F32->QS8 Convert operator");
198
+ return;
199
+ }
200
+
201
+ status = xnn_setup_convert_nc_f32_qs8(convert_op, input.data(), output.data());
202
+ if (status != xnn_status_success) {
203
+ state.SkipWithError("failed to setup F32->QS8 Convert operator");
204
+ return;
205
+ }
206
+
207
+ for (auto _ : state) {
208
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
209
+ if (status != xnn_status_success) {
210
+ state.SkipWithError("failed to run F32->QS8 Convert operator");
211
+ return;
212
+ }
213
+ }
214
+
215
+ status = xnn_delete_operator(convert_op);
216
+ if (status != xnn_status_success) {
217
+ state.SkipWithError("failed to delete F32->QS8 Convert operator");
218
+ return;
219
+ }
220
+ convert_op = nullptr;
221
+
222
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
223
+ if (cpu_frequency != 0) {
224
+ state.counters["cpufreq"] = cpu_frequency;
225
+ }
226
+
227
+ state.counters["elements"] =
228
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
229
+
230
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
231
+ state.counters["bytes"] =
232
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
233
+ }
234
+
235
+ void xnnpack_convert_f32_qu8(benchmark::State& state) {
236
+ const size_t batch_size = state.range(0);
237
+
238
+ std::random_device random_device;
239
+ auto rng = std::mt19937(random_device());
240
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
241
+
242
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
243
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
244
+ std::vector<uint8_t> output(batch_size);
245
+ std::fill(output.begin(), output.end(), 0);
246
+
247
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
248
+ if (status != xnn_status_success) {
249
+ state.SkipWithError("failed to initialize XNNPACK");
250
+ return;
251
+ }
252
+
253
+ xnn_operator_t convert_op = nullptr;
254
+ status = xnn_create_convert_nc_f32_qu8(
255
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
256
+ 1.0f / 128.0f /* scale */, 127 /* zero point */,
257
+ std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max(),
258
+ 0 /* flags */, &convert_op);
259
+ if (status != xnn_status_success) {
260
+ state.SkipWithError("failed to create F32->QU8 Convert operator");
261
+ return;
262
+ }
263
+
264
+ status = xnn_reshape_convert_nc_f32_qu8(convert_op, batch_size, /*threadpool=*/nullptr);
265
+ if (status != xnn_status_success) {
266
+ state.SkipWithError("failed to reshape F32->QU8 Convert operator");
267
+ return;
268
+ }
269
+
270
+ status = xnn_setup_convert_nc_f32_qu8(convert_op, input.data(), output.data());
271
+ if (status != xnn_status_success) {
272
+ state.SkipWithError("failed to setup F32->QU8 Convert operator");
273
+ return;
274
+ }
275
+
276
+ for (auto _ : state) {
277
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
278
+ if (status != xnn_status_success) {
279
+ state.SkipWithError("failed to run F32->QU8 Convert operator");
280
+ return;
281
+ }
282
+ }
283
+
284
+ status = xnn_delete_operator(convert_op);
285
+ if (status != xnn_status_success) {
286
+ state.SkipWithError("failed to delete F32->QU8 Convert operator");
287
+ return;
288
+ }
289
+ convert_op = nullptr;
290
+
291
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
292
+ if (cpu_frequency != 0) {
293
+ state.counters["cpufreq"] = cpu_frequency;
294
+ }
295
+
296
+ state.counters["elements"] =
297
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
298
+
299
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
300
+ state.counters["bytes"] =
301
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
302
+ }
303
+
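+ // QS8->QS8 Convert requantizes between two int8 parameter sets
+ // (scale 0.75, zero point -1 on input -> scale 0.5, zero point +1 on output).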
304
+ void xnnpack_convert_qs8(benchmark::State& state) {
305
+ const size_t batch_size = state.range(0);
306
+
307
+ std::random_device random_device;
308
+ auto rng = std::mt19937(random_device());
309
+ auto i8rng = std::bind(
310
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
311
+ std::ref(rng));
312
+
313
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
314
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
315
+ std::vector<int8_t> output(batch_size);
316
+ std::fill(output.begin(), output.end(), INT8_C(0xAA));
317
+
318
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
319
+ if (status != xnn_status_success) {
320
+ state.SkipWithError("failed to initialize XNNPACK");
321
+ return;
322
+ }
323
+
324
+ xnn_operator_t convert_op = nullptr;
325
+ status = xnn_create_convert_nc_qs8(
326
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
327
+ 0.75f /* input scale */, -1 /* input zero point */,
328
+ 0.5f /* output scale */, 1 /* output zero point */,
329
+ 0 /* flags */, &convert_op);
330
+ if (status != xnn_status_success) {
331
+ state.SkipWithError("failed to create QS8 Convert operator");
332
+ return;
333
+ }
334
+
335
+ status = xnn_reshape_convert_nc_qs8(convert_op, batch_size, /*threadpool=*/nullptr);
336
+ if (status != xnn_status_success) {
337
+ state.SkipWithError("failed to reshape QS8 Convert operator");
338
+ return;
339
+ }
340
+
341
+ status = xnn_setup_convert_nc_qs8(convert_op, input.data(), output.data());
342
+ if (status != xnn_status_success) {
343
+ state.SkipWithError("failed to setup QS8 Convert operator");
344
+ return;
345
+ }
346
+
347
+ for (auto _ : state) {
348
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
349
+ if (status != xnn_status_success) {
350
+ state.SkipWithError("failed to run QS8 Convert operator");
351
+ return;
352
+ }
353
+ }
354
+
355
+ status = xnn_delete_operator(convert_op);
356
+ if (status != xnn_status_success) {
357
+ state.SkipWithError("failed to delete QS8 Convert operator");
358
+ return;
359
+ }
360
+ convert_op = nullptr;
361
+
362
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
363
+ if (cpu_frequency != 0) {
364
+ state.counters["cpufreq"] = cpu_frequency;
365
+ }
366
+
367
+ state.counters["elements"] =
368
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
369
+
370
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
371
+ state.counters["bytes"] =
372
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
373
+ }
374
+
375
+ void xnnpack_convert_qs8_f32(benchmark::State& state) {
376
+ const size_t batch_size = state.range(0);
377
+
378
+ std::random_device random_device;
379
+ auto rng = std::mt19937(random_device());
380
+ auto i8rng = std::bind(
381
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
382
+ std::ref(rng));
383
+
384
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
385
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
386
+ std::vector<float> output(batch_size);
387
+ std::fill(output.begin(), output.end(), std::nanf(""));
388
+
389
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
390
+ if (status != xnn_status_success) {
391
+ state.SkipWithError("failed to initialize XNNPACK");
392
+ return;
393
+ }
394
+
395
+ xnn_operator_t convert_op = nullptr;
396
+ status = xnn_create_convert_nc_qs8_f32(
397
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
398
+ 1.0f / 255.0f /* scale */, -128 /* zero point */,
399
+ 0 /* flags */, &convert_op);
400
+ if (status != xnn_status_success) {
401
+ state.SkipWithError("failed to create QS8->F32 Convert operator");
402
+ return;
403
+ }
404
+
405
+ status = xnn_reshape_convert_nc_qs8_f32(convert_op, batch_size, /*threadpool=*/nullptr);
406
+ if (status != xnn_status_success) {
407
+ state.SkipWithError("failed to reshape QS8->F32 Convert operator");
408
+ return;
409
+ }
410
+
411
+ status = xnn_setup_convert_nc_qs8_f32(convert_op, input.data(), output.data());
412
+ if (status != xnn_status_success) {
413
+ state.SkipWithError("failed to setup QS8->F32 Convert operator");
414
+ return;
415
+ }
416
+
417
+ for (auto _ : state) {
418
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
419
+ if (status != xnn_status_success) {
420
+ state.SkipWithError("failed to run QS8->F32 Convert operator");
421
+ return;
422
+ }
423
+ }
424
+
425
+ status = xnn_delete_operator(convert_op);
426
+ if (status != xnn_status_success) {
427
+ state.SkipWithError("failed to delete QS8->F32 Convert operator");
428
+ return;
429
+ }
430
+ convert_op = nullptr;
431
+
432
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
433
+ if (cpu_frequency != 0) {
434
+ state.counters["cpufreq"] = cpu_frequency;
435
+ }
436
+
437
+ state.counters["elements"] =
438
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
439
+
440
+ const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
441
+ state.counters["bytes"] =
442
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
443
+ }
444
+
445
+ void xnnpack_convert_qu8(benchmark::State& state) {
446
+ const size_t batch_size = state.range(0);
447
+
448
+ std::random_device random_device;
449
+ auto rng = std::mt19937(random_device());
450
+ auto u8rng = std::bind(
451
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
452
+ std::ref(rng));
453
+
454
+ std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
455
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
456
+ std::vector<uint8_t> output(batch_size);
457
+ std::fill(output.begin(), output.end(), UINT8_C(0xAA));
458
+
459
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
460
+ if (status != xnn_status_success) {
461
+ state.SkipWithError("failed to initialize XNNPACK");
462
+ return;
463
+ }
464
+
465
+ xnn_operator_t convert_op = nullptr;
466
+ status = xnn_create_convert_nc_qu8(
467
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
468
+ 0.75f /* scale */, 125 /* zero point */,
469
+ 0.5f /* scale */, 130 /* zero point */,
470
+ 0 /* flags */, &convert_op);
471
+ if (status != xnn_status_success) {
472
+ state.SkipWithError("failed to create QU8 Convert operator");
473
+ return;
474
+ }
475
+
476
+ status = xnn_reshape_convert_nc_qu8(convert_op, batch_size, /*threadpool=*/nullptr);
477
+ if (status != xnn_status_success) {
478
+ state.SkipWithError("failed to reshape QU8 Convert operator");
479
+ return;
480
+ }
481
+
482
+ status = xnn_setup_convert_nc_qu8(convert_op, input.data(), output.data());
483
+ if (status != xnn_status_success) {
484
+ state.SkipWithError("failed to setup QU8 Convert operator");
485
+ return;
486
+ }
487
+
488
+ for (auto _ : state) {
489
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
490
+ if (status != xnn_status_success) {
491
+ state.SkipWithError("failed to run QU8 Convert operator");
492
+ return;
493
+ }
494
+ }
495
+
496
+ status = xnn_delete_operator(convert_op);
497
+ if (status != xnn_status_success) {
498
+ state.SkipWithError("failed to delete QU8 Convert operator");
499
+ return;
500
+ }
501
+ convert_op = nullptr;
502
+
503
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
504
+ if (cpu_frequency != 0) {
505
+ state.counters["cpufreq"] = cpu_frequency;
506
+ }
507
+
508
+ state.counters["elements"] =
509
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
510
+
511
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
512
+ state.counters["bytes"] =
513
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
514
+ }
515
+
516
+ void xnnpack_convert_qu8_f32(benchmark::State& state) {
517
+ const size_t batch_size = state.range(0);
518
+
519
+ std::random_device random_device;
520
+ auto rng = std::mt19937(random_device());
521
+ auto u8rng = std::bind(
522
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
523
+ std::ref(rng));
524
+
525
+ std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
526
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
527
+ std::vector<float> output(batch_size);
528
+ std::fill(output.begin(), output.end(), std::nanf(""));
529
+
530
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
531
+ if (status != xnn_status_success) {
532
+ state.SkipWithError("failed to initialize XNNPACK");
533
+ return;
534
+ }
535
+
536
+ xnn_operator_t convert_op = nullptr;
537
+ status = xnn_create_convert_nc_qu8_f32(
538
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
539
+ 1.0f / 128.0f /* scale */, 128 /* zero point */,
540
+ 0 /* flags */, &convert_op);
541
+ if (status != xnn_status_success) {
542
+ state.SkipWithError("failed to create QU8->F32 Convert operator");
543
+ return;
544
+ }
545
+
546
+ status = xnn_reshape_convert_nc_qu8_f32(convert_op, batch_size, /*threadpool=*/nullptr);
547
+ if (status != xnn_status_success) {
548
+ state.SkipWithError("failed to reshape QU8->F32 Convert operator");
549
+ return;
550
+ }
551
+
552
+ status = xnn_setup_convert_nc_qu8_f32(convert_op, input.data(), output.data());
553
+ if (status != xnn_status_success) {
554
+ state.SkipWithError("failed to setup QU8->F32 Convert operator");
555
+ return;
556
+ }
557
+
558
+ for (auto _ : state) {
559
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
560
+ if (status != xnn_status_success) {
561
+ state.SkipWithError("failed to run QU8->F32 Convert operator");
562
+ return;
563
+ }
564
+ }
565
+
566
+ status = xnn_delete_operator(convert_op);
567
+ if (status != xnn_status_success) {
568
+ state.SkipWithError("failed to delete QU8->F32 Convert operator");
569
+ return;
570
+ }
571
+ convert_op = nullptr;
572
+
573
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
574
+ if (cpu_frequency != 0) {
575
+ state.counters["cpufreq"] = cpu_frequency;
576
+ }
577
+
578
+ state.counters["elements"] =
579
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
580
+
581
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
582
+ state.counters["bytes"] =
583
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
584
+ }
585
+
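+ // The tflite_convert_* baselines below build a minimal single-operator
+ // QUANTIZE or DEQUANTIZE model in memory with FlatBuffers and run it through
+ // the single-threaded TFLite interpreter. BuiltinOpResolverWithoutDefaultDelegates
+ // keeps the default XNNPACK delegate out, so the reference kernels are measured.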
586
+ #ifdef BENCHMARK_TENSORFLOW_LITE
587
+ void tflite_convert_f16_f32(benchmark::State& state) {
588
+ const size_t batch_size = state.range(0);
589
+
590
+ std::random_device random_device;
591
+ auto rng = std::mt19937(random_device());
592
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
593
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
594
+
595
+ flatbuffers::FlatBufferBuilder builder;
596
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
597
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
598
+
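+ // TFLite convention: buffer 0 is the empty buffer; tensors referencing it
+ // are allocated by the interpreter and filled at runtime.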
599
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
600
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
601
+ }};
602
+
603
+ const std::array<int32_t, 1> shape{{
604
+ static_cast<int32_t>(batch_size)
605
+ }};
606
+
607
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
608
+ tflite::CreateTensor(builder,
609
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
610
+ tflite::TensorType_FLOAT16),
611
+ tflite::CreateTensor(builder,
612
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
613
+ tflite::TensorType_FLOAT32)
614
+ }};
615
+
616
+ const std::array<int32_t, 1> op_inputs{{0}};
617
+ const std::array<int32_t, 1> op_outputs{{1}};
618
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
619
+ 0 /* opcode_index */,
620
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
621
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
622
+
623
+ const std::array<int32_t, 1> graph_inputs{{0}};
624
+ const std::array<int32_t, 1> graph_outputs{{1}};
625
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
626
+ builder,
627
+ builder.CreateVector(tensors.data(), tensors.size()),
628
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
629
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
630
+ builder.CreateVector(&op, 1));
631
+
632
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
633
+
634
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
635
+ TFLITE_SCHEMA_VERSION,
636
+ builder.CreateVector(&operator_code, 1),
637
+ builder.CreateVector(&subgraph, 1),
638
+ description,
639
+ builder.CreateVector(buffers.data(), buffers.size()));
640
+
641
+ builder.Finish(model_buffer);
642
+
643
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
644
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
645
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
646
+ std::unique_ptr<tflite::Interpreter> interpreter;
647
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
648
+ state.SkipWithError("failed to create TFLite interpreter");
649
+ return;
650
+ }
651
+ interpreter->SetNumThreads(1);
652
+
653
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
654
+ state.SkipWithError("failed to allocate tensors");
655
+ return;
656
+ }
657
+
658
+ uint16_t* input_data = reinterpret_cast<uint16_t*>(interpreter->tensor(0)->data.data);
659
+ std::generate_n(input_data, batch_size, std::ref(f16rng));
660
+
661
+ for (auto _ : state) {
662
+ if (interpreter->Invoke() != kTfLiteOk) {
663
+ state.SkipWithError("failed to invoke TFLite interpreter");
664
+ return;
665
+ }
666
+ }
667
+
668
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
669
+ if (cpu_frequency != 0) {
670
+ state.counters["cpufreq"] = cpu_frequency;
671
+ }
672
+
673
+ state.counters["elements"] =
674
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
675
+
676
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
677
+ state.counters["bytes"] =
678
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
679
+
680
+ interpreter.reset();
681
+ }
682
+
683
+ void tflite_convert_f32_qs8(benchmark::State& state) {
684
+ const size_t batch_size = state.range(0);
685
+
686
+ std::random_device random_device;
687
+ auto rng = std::mt19937(random_device());
688
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
689
+
690
+ flatbuffers::FlatBufferBuilder builder;
691
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
692
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
693
+
694
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
695
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
696
+ }};
697
+
698
+ const std::array<int32_t, 1> shape{{
699
+ static_cast<int32_t>(batch_size)
700
+ }};
701
+
702
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
703
+ tflite::CreateTensor(builder,
704
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
705
+ tflite::TensorType_FLOAT32),
706
+ tflite::CreateTensor(builder,
707
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
708
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
709
+ tflite::CreateQuantizationParameters(builder,
710
+ 0 /*min*/, 0 /*max*/,
711
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
712
+ builder.CreateVector<int64_t>({1 /* zero point */})))
713
+ }};
714
+
715
+ const std::array<int32_t, 1> op_inputs{{0}};
716
+ const std::array<int32_t, 1> op_outputs{{1}};
717
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
718
+ 0 /* opcode_index */,
719
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
720
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
721
+
722
+ const std::array<int32_t, 1> graph_inputs{{0}};
723
+ const std::array<int32_t, 1> graph_outputs{{1}};
724
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
725
+ builder,
726
+ builder.CreateVector(tensors.data(), tensors.size()),
727
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
728
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
729
+ builder.CreateVector(&op, 1));
730
+
731
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
732
+
733
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
734
+ TFLITE_SCHEMA_VERSION,
735
+ builder.CreateVector(&operator_code, 1),
736
+ builder.CreateVector(&subgraph, 1),
737
+ description,
738
+ builder.CreateVector(buffers.data(), buffers.size()));
739
+
740
+ builder.Finish(model_buffer);
741
+
742
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
743
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
744
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
745
+ std::unique_ptr<tflite::Interpreter> interpreter;
746
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
747
+ state.SkipWithError("failed to create TFLite interpreter");
748
+ return;
749
+ }
750
+ interpreter->SetNumThreads(1);
751
+
752
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
753
+ state.SkipWithError("failed to allocate tensors");
754
+ return;
755
+ }
756
+
757
+ std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));
758
+
759
+ for (auto _ : state) {
760
+ if (interpreter->Invoke() != kTfLiteOk) {
761
+ state.SkipWithError("failed to invoke TFLite interpreter");
762
+ return;
763
+ }
764
+ }
765
+
766
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
767
+ if (cpu_frequency != 0) {
768
+ state.counters["cpufreq"] = cpu_frequency;
769
+ }
770
+
771
+ state.counters["elements"] =
772
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
773
+
774
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
775
+ state.counters["bytes"] =
776
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
777
+
778
+ interpreter.reset();
779
+ }
780
+
781
+ void tflite_convert_f32_qu8(benchmark::State& state) {
782
+ const size_t batch_size = state.range(0);
783
+
784
+ std::random_device random_device;
785
+ auto rng = std::mt19937(random_device());
786
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
787
+
788
+ flatbuffers::FlatBufferBuilder builder;
789
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
790
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
791
+
792
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
793
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
794
+ }};
795
+
796
+ const std::array<int32_t, 1> shape{{
797
+ static_cast<int32_t>(batch_size)
798
+ }};
799
+
800
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
801
+ tflite::CreateTensor(builder,
802
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
803
+ tflite::TensorType_FLOAT32),
804
+ tflite::CreateTensor(builder,
805
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
806
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
807
+ tflite::CreateQuantizationParameters(builder,
808
+ 0 /*min*/, 0 /*max*/,
809
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
810
+ builder.CreateVector<int64_t>({127 /* zero point */})))
811
+ }};
812
+
813
+ const std::array<int32_t, 1> op_inputs{{0}};
814
+ const std::array<int32_t, 1> op_outputs{{1}};
815
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
816
+ 0 /* opcode_index */,
817
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
818
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
819
+
820
+ const std::array<int32_t, 1> graph_inputs{{0}};
821
+ const std::array<int32_t, 1> graph_outputs{{1}};
822
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
823
+ builder,
824
+ builder.CreateVector(tensors.data(), tensors.size()),
825
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
826
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
827
+ builder.CreateVector(&op, 1));
828
+
829
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
830
+
831
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
832
+ TFLITE_SCHEMA_VERSION,
833
+ builder.CreateVector(&operator_code, 1),
834
+ builder.CreateVector(&subgraph, 1),
835
+ description,
836
+ builder.CreateVector(buffers.data(), buffers.size()));
837
+
838
+ builder.Finish(model_buffer);
839
+
840
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
841
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
842
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
843
+ std::unique_ptr<tflite::Interpreter> interpreter;
844
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
845
+ state.SkipWithError("failed to create TFLite interpreter");
846
+ return;
847
+ }
848
+ interpreter->SetNumThreads(1);
849
+
850
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
851
+ state.SkipWithError("failed to allocate tensors");
852
+ return;
853
+ }
854
+
855
+ std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));
856
+
857
+ for (auto _ : state) {
858
+ if (interpreter->Invoke() != kTfLiteOk) {
859
+ state.SkipWithError("failed to invoke TFLite interpreter");
860
+ return;
861
+ }
862
+ }
863
+
864
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
865
+ if (cpu_frequency != 0) {
866
+ state.counters["cpufreq"] = cpu_frequency;
867
+ }
868
+
869
+ state.counters["elements"] =
870
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
871
+
872
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
873
+ state.counters["bytes"] =
874
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
875
+
876
+ interpreter.reset();
877
+ }
878
+
879
+ void tflite_convert_qs8(benchmark::State& state) {
880
+ const size_t batch_size = state.range(0);
881
+
882
+ std::random_device random_device;
883
+ auto rng = std::mt19937(random_device());
884
+ auto i8rng = std::bind(
885
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
886
+ std::ref(rng));
887
+
888
+ flatbuffers::FlatBufferBuilder builder;
889
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
890
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
891
+
892
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
893
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
894
+ }};
895
+
896
+ const std::array<int32_t, 1> shape{{
897
+ static_cast<int32_t>(batch_size)
898
+ }};
899
+
900
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
901
+ tflite::CreateTensor(builder,
902
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
903
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
904
+ tflite::CreateQuantizationParameters(builder,
905
+ 0 /*min*/, 0 /*max*/,
906
+ builder.CreateVector<float>({0.75f /* scale */}),
907
+ builder.CreateVector<int64_t>({-1 /* zero point */}))),
908
+ tflite::CreateTensor(builder,
909
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
910
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
911
+ tflite::CreateQuantizationParameters(builder,
912
+ 0 /*min*/, 0 /*max*/,
913
+ builder.CreateVector<float>({0.5f /* scale */}),
914
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
915
+ }};
916
+
917
+ const std::array<int32_t, 1> op_inputs{{0}};
918
+ const std::array<int32_t, 1> op_outputs{{1}};
919
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
920
+ 0 /* opcode_index */,
921
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
922
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
923
+
924
+ const std::array<int32_t, 1> graph_inputs{{0}};
925
+ const std::array<int32_t, 1> graph_outputs{{1}};
926
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
927
+ builder,
928
+ builder.CreateVector(tensors.data(), tensors.size()),
929
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
930
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
931
+ builder.CreateVector(&op, 1));
932
+
933
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
934
+
935
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
936
+ TFLITE_SCHEMA_VERSION,
937
+ builder.CreateVector(&operator_code, 1),
938
+ builder.CreateVector(&subgraph, 1),
939
+ description,
940
+ builder.CreateVector(buffers.data(), buffers.size()));
941
+
942
+ builder.Finish(model_buffer);
943
+
944
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
945
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
946
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
947
+ std::unique_ptr<tflite::Interpreter> interpreter;
948
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
949
+ state.SkipWithError("failed to create TFLite interpreter");
950
+ return;
951
+ }
952
+ interpreter->SetNumThreads(1);
953
+
954
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
955
+ state.SkipWithError("failed to allocate tensors");
956
+ return;
957
+ }
958
+
959
+ std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(i8rng));
960
+
961
+ for (auto _ : state) {
962
+ if (interpreter->Invoke() != kTfLiteOk) {
963
+ state.SkipWithError("failed to invoke TFLite interpreter");
964
+ return;
965
+ }
966
+ }
967
+
968
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
969
+ if (cpu_frequency != 0) {
970
+ state.counters["cpufreq"] = cpu_frequency;
971
+ }
972
+
973
+ state.counters["elements"] =
974
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
975
+
976
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
977
+ state.counters["bytes"] =
978
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
979
+
980
+ interpreter.reset();
981
+ }
982
+
983
+ void tflite_convert_qs8_f32(benchmark::State& state) {
984
+ const size_t batch_size = state.range(0);
985
+
986
+ std::random_device random_device;
987
+ auto rng = std::mt19937(random_device());
988
+ auto i8rng = std::bind(
989
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
990
+ std::ref(rng));
991
+
992
+ flatbuffers::FlatBufferBuilder builder;
993
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
994
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
995
+
996
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
997
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
998
+ }};
999
+
1000
+ const std::array<int32_t, 1> shape{{
1001
+ static_cast<int32_t>(batch_size)
1002
+ }};
1003
+
1004
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1005
+ tflite::CreateTensor(builder,
1006
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1007
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
1008
+ tflite::CreateQuantizationParameters(builder,
1009
+ 0 /*min*/, 0 /*max*/,
1010
+ builder.CreateVector<float>({1.0f / 255.0f /* scale */}),
1011
+ builder.CreateVector<int64_t>({-128 /* zero point */}))),
1012
+ tflite::CreateTensor(builder,
1013
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1014
+ tflite::TensorType_FLOAT32)
1015
+ }};
1016
+
1017
+ const std::array<int32_t, 1> op_inputs{{0}};
1018
+ const std::array<int32_t, 1> op_outputs{{1}};
1019
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1020
+ 0 /* opcode_index */,
1021
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1022
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1023
+
1024
+ const std::array<int32_t, 1> graph_inputs{{0}};
1025
+ const std::array<int32_t, 1> graph_outputs{{1}};
1026
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1027
+ builder,
1028
+ builder.CreateVector(tensors.data(), tensors.size()),
1029
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1030
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1031
+ builder.CreateVector(&op, 1));
1032
+
1033
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
1034
+
1035
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1036
+ TFLITE_SCHEMA_VERSION,
1037
+ builder.CreateVector(&operator_code, 1),
1038
+ builder.CreateVector(&subgraph, 1),
1039
+ description,
1040
+ builder.CreateVector(buffers.data(), buffers.size()));
1041
+
1042
+ builder.Finish(model_buffer);
1043
+
1044
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1045
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1046
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1047
+ std::unique_ptr<tflite::Interpreter> interpreter;
1048
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1049
+ state.SkipWithError("failed to create TFLite interpreter");
1050
+ return;
1051
+ }
1052
+ interpreter->SetNumThreads(1);
1053
+
1054
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1055
+ state.SkipWithError("failed to allocate tensors");
1056
+ return;
1057
+ }
1058
+
1059
+ std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(i8rng));
1060
+
1061
+ for (auto _ : state) {
1062
+ if (interpreter->Invoke() != kTfLiteOk) {
1063
+ state.SkipWithError("failed to invoke TFLite interpreter");
1064
+ return;
1065
+ }
1066
+ }
1067
+
1068
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1069
+ if (cpu_frequency != 0) {
1070
+ state.counters["cpufreq"] = cpu_frequency;
1071
+ }
1072
+
1073
+ state.counters["elements"] =
1074
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1075
+
1076
+ const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
1077
+ state.counters["bytes"] =
1078
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1079
+
1080
+ interpreter.reset();
1081
+ }
1082
+
1083
+ void tflite_convert_qu8(benchmark::State& state) {
1084
+ const size_t batch_size = state.range(0);
1085
+
1086
+ std::random_device random_device;
1087
+ auto rng = std::mt19937(random_device());
1088
+ auto u8rng = std::bind(
1089
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
1090
+ std::ref(rng));
1091
+
1092
+ flatbuffers::FlatBufferBuilder builder;
1093
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
1094
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
1095
+
1096
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
1097
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
1098
+ }};
1099
+
1100
+ const std::array<int32_t, 1> shape{{
1101
+ static_cast<int32_t>(batch_size)
1102
+ }};
1103
+
1104
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1105
+ tflite::CreateTensor(builder,
1106
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1107
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1108
+ tflite::CreateQuantizationParameters(builder,
1109
+ 0 /*min*/, 0 /*max*/,
1110
+ builder.CreateVector<float>({0.75f /* scale */}),
1111
+ builder.CreateVector<int64_t>({125 /* zero point */}))),
1112
+ tflite::CreateTensor(builder,
1113
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1114
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1115
+ tflite::CreateQuantizationParameters(builder,
1116
+ 0 /*min*/, 0 /*max*/,
1117
+ builder.CreateVector<float>({0.5f /* scale */}),
1118
+ builder.CreateVector<int64_t>({130 /* zero point */})))
1119
+ }};
1120
+
1121
+ const std::array<int32_t, 1> op_inputs{{0}};
1122
+ const std::array<int32_t, 1> op_outputs{{1}};
1123
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1124
+ 0 /* opcode_index */,
1125
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1126
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1127
+
1128
+ const std::array<int32_t, 1> graph_inputs{{0}};
1129
+ const std::array<int32_t, 1> graph_outputs{{1}};
1130
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1131
+ builder,
1132
+ builder.CreateVector(tensors.data(), tensors.size()),
1133
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1134
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1135
+ builder.CreateVector(&op, 1));
1136
+
1137
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
1138
+
1139
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1140
+ TFLITE_SCHEMA_VERSION,
1141
+ builder.CreateVector(&operator_code, 1),
1142
+ builder.CreateVector(&subgraph, 1),
1143
+ description,
1144
+ builder.CreateVector(buffers.data(), buffers.size()));
1145
+
1146
+ builder.Finish(model_buffer);
1147
+
1148
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1149
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1150
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1151
+ std::unique_ptr<tflite::Interpreter> interpreter;
1152
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1153
+ state.SkipWithError("failed to create TFLite interpreter");
1154
+ return;
1155
+ }
1156
+ interpreter->SetNumThreads(1);
1157
+
1158
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1159
+ state.SkipWithError("failed to allocate tensors");
1160
+ return;
1161
+ }
1162
+
1163
+ std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(u8rng));
1164
+
1165
+ for (auto _ : state) {
1166
+ if (interpreter->Invoke() != kTfLiteOk) {
1167
+ state.SkipWithError("failed to invoke TFLite interpreter");
1168
+ return;
1169
+ }
1170
+ }
1171
+
1172
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1173
+ if (cpu_frequency != 0) {
1174
+ state.counters["cpufreq"] = cpu_frequency;
1175
+ }
1176
+
1177
+ state.counters["elements"] =
1178
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1179
+
1180
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
1181
+ state.counters["bytes"] =
1182
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1183
+
1184
+ interpreter.reset();
1185
+ }
1186
+
1187
+ void tflite_convert_qu8_f32(benchmark::State& state) {
1188
+ const size_t batch_size = state.range(0);
1189
+
1190
+ std::random_device random_device;
1191
+ auto rng = std::mt19937(random_device());
1192
+ auto u8rng = std::bind(
1193
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
1194
+ std::ref(rng));
1195
+
1196
+ flatbuffers::FlatBufferBuilder builder;
1197
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
1198
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
1199
+
1200
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
1201
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
1202
+ }};
1203
+
1204
+ const std::array<int32_t, 1> shape{{
1205
+ static_cast<int32_t>(batch_size)
1206
+ }};
1207
+
1208
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1209
+ tflite::CreateTensor(builder,
1210
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1211
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1212
+ tflite::CreateQuantizationParameters(builder,
1213
+ 0 /*min*/, 0 /*max*/,
1214
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
1215
+ builder.CreateVector<int64_t>({128 /* zero point */}))),
1216
+ tflite::CreateTensor(builder,
1217
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1218
+ tflite::TensorType_FLOAT32)
1219
+ }};
1220
+
1221
+ const std::array<int32_t, 1> op_inputs{{0}};
1222
+ const std::array<int32_t, 1> op_outputs{{1}};
1223
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1224
+ 0 /* opcode_index */,
1225
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1226
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1227
+
1228
+ const std::array<int32_t, 1> graph_inputs{{0}};
1229
+ const std::array<int32_t, 1> graph_outputs{{1}};
1230
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1231
+ builder,
1232
+ builder.CreateVector(tensors.data(), tensors.size()),
1233
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1234
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1235
+ builder.CreateVector(&op, 1));
1236
+
1237
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
1238
+
1239
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1240
+ TFLITE_SCHEMA_VERSION,
1241
+ builder.CreateVector(&operator_code, 1),
1242
+ builder.CreateVector(&subgraph, 1),
1243
+ description,
1244
+ builder.CreateVector(buffers.data(), buffers.size()));
1245
+
1246
+ builder.Finish(model_buffer);
1247
+
1248
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1249
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1250
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1251
+ std::unique_ptr<tflite::Interpreter> interpreter;
1252
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1253
+ state.SkipWithError("failed to create TFLite interpreter");
1254
+ return;
1255
+ }
1256
+ interpreter->SetNumThreads(1);
1257
+
1258
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1259
+ state.SkipWithError("failed to allocate tensors");
1260
+ return;
1261
+ }
1262
+
1263
+ std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(u8rng));
1264
+
1265
+ for (auto _ : state) {
1266
+ if (interpreter->Invoke() != kTfLiteOk) {
1267
+ state.SkipWithError("failed to invoke TFLite interpreter");
1268
+ return;
1269
+ }
1270
+ }
1271
+
1272
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1273
+ if (cpu_frequency != 0) {
1274
+ state.counters["cpufreq"] = cpu_frequency;
1275
+ }
1276
+
1277
+ state.counters["elements"] =
1278
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1279
+
1280
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
1281
+ state.counters["bytes"] =
1282
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1283
+
1284
+ interpreter.reset();
1285
+ }
1286
+ #endif // BENCHMARK_TENSORFLOW_LITE
1287
+
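+ // Register each benchmark over a batch-size sweep sized for the input/output
+ // element types (benchmark::utils::UnaryElementwiseParameters from
+ // bench/utils.h); real time is measured because these conversions are
+ // memory-bound.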
1288
+ BENCHMARK(xnnpack_convert_f16_f32)
1289
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
1290
+ ->UseRealTime();
1291
+ BENCHMARK(xnnpack_convert_f32_f16)
1292
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint16_t>)
1293
+ ->UseRealTime();
1294
+ BENCHMARK(xnnpack_convert_f32_qs8)
1295
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
1296
+ ->UseRealTime();
1297
+ BENCHMARK(xnnpack_convert_f32_qu8)
1298
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
1299
+ ->UseRealTime();
1300
+ BENCHMARK(xnnpack_convert_qs8)
1301
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
1302
+ ->UseRealTime();
1303
+ BENCHMARK(xnnpack_convert_qs8_f32)
1304
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
1305
+ ->UseRealTime();
1306
+ BENCHMARK(xnnpack_convert_qu8)
1307
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
1308
+ ->UseRealTime();
1309
+ BENCHMARK(xnnpack_convert_qu8_f32)
1310
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
1311
+ ->UseRealTime();
1312
+
1313
+ #ifdef BENCHMARK_TENSORFLOW_LITE
1314
+ BENCHMARK(tflite_convert_f16_f32)
1315
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
1316
+ ->UseRealTime();
1317
+ BENCHMARK(tflite_convert_f32_qs8)
1318
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
1319
+ ->UseRealTime();
1320
+ BENCHMARK(tflite_convert_f32_qu8)
1321
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
1322
+ ->UseRealTime();
1323
+ BENCHMARK(tflite_convert_qs8)
1324
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
1325
+ ->UseRealTime();
1326
+ BENCHMARK(tflite_convert_qs8_f32)
1327
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
1328
+ ->UseRealTime();
1329
+ BENCHMARK(tflite_convert_qu8)
1330
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
1331
+ ->UseRealTime();
1332
+ BENCHMARK(tflite_convert_qu8_f32)
1333
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
1334
+ ->UseRealTime();
1335
+ #endif // BENCHMARK_TENSORFLOW_LITE
1336
+
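+ // Define main() unless this file is linked into a combined benchmark binary
+ // that provides one.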
1337
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
1338
+ BENCHMARK_MAIN();
1339
+ #endif
bench/convolution.cc ADDED
@@ -0,0 +1,1768 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <limits>
14
+ #include <memory>
15
+ #include <ostream>
16
+ #include <random>
17
+ #include <string>
18
+ #include <vector>
19
+
20
+ #include <xnnpack.h>
21
+
22
+ #include <benchmark/benchmark.h>
23
+ #include <fp16/fp16.h>
24
+ #ifdef BENCHMARK_TENSORFLOW_LITE
25
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
26
+ #include "tensorflow/lite/interpreter.h"
27
+ #include "tensorflow/lite/kernels/register.h"
28
+ #include "tensorflow/lite/model.h"
29
+ #include "tensorflow/lite/schema/schema_generated.h"
30
+ #include "tensorflow/lite/version.h"
31
+ #endif // BENCHMARK_TENSORFLOW_LITE
32
+ #include "bench/utils.h"
33
+
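+ // Benchmark arguments (state.range): batch size, input height/width,
+ // kernel height/width, padding height/width, subsampling (stride), dilation,
+ // groups, and per-group input/output channel counts.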
34
+ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
35
+ const size_t batch_size = state.range(0);
36
+ const size_t input_height = state.range(1);
37
+ const size_t input_width = state.range(2);
38
+ const size_t kernel_height = state.range(3);
39
+ const size_t kernel_width = state.range(4);
40
+ const size_t padding_height = state.range(5);
41
+ const size_t padding_width = state.range(6);
42
+ const size_t subsampling = state.range(7);
43
+ const size_t dilation = state.range(8);
44
+ const size_t groups = state.range(9);
45
+ const size_t group_input_channels = state.range(10);
46
+ const size_t group_output_channels = state.range(11);
47
+
48
+ std::random_device random_device;
49
+ auto rng = std::mt19937(random_device());
50
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
51
+ auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
52
+
53
+ const size_t output_pixel_stride = groups * group_output_channels;
54
+ const size_t input_pixel_stride = groups * group_input_channels;
55
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
56
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
57
+ const size_t padding_left = padding_width / 2;
58
+ const size_t padding_top = padding_height / 2;
59
+ const size_t padding_right = padding_width - padding_left;
60
+ const size_t padding_bottom = padding_height - padding_top;
61
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
62
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
63
+
64
+ std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
65
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
66
+ std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
67
+ std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
68
+ std::vector<int32_t> bias(groups * group_output_channels);
69
+ std::generate(bias.begin(), bias.end(), std::ref(i32rng));
70
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
71
+
72
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
73
+ if (status != xnn_status_success) {
74
+ state.SkipWithError("failed to initialize XNNPACK");
75
+ return;
76
+ }
77
+
78
+ const size_t num_buffers = 1 +
79
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
80
+ sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
81
+ std::vector<uint8_t> output(output_elements * num_buffers);
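+ // Buffer rotation: enough operator/output copies are allocated that the
+ // combined working set (weights + bias + one output) overflows the largest
+ // cache before any copy is reused, so each iteration runs cache-cold. E.g.
+ // with a 2 MiB cache and a 256 KiB per-copy footprint this yields
+ // num_buffers = 1 + ceil(2097152 / 262144) = 9.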
82
+
83
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
84
+ for (xnn_operator_t& convolution_op : convolution_operators) {
85
+ status = xnn_create_convolution2d_nhwc_qu8(
86
+ padding_top, padding_right, padding_bottom, padding_left,
87
+ kernel_height, kernel_width,
88
+ subsampling, subsampling,
89
+ dilation, dilation,
90
+ groups, group_input_channels, group_output_channels,
91
+ input_pixel_stride, output_pixel_stride,
92
+ 127, 0.5f,
93
+ 127, 0.5f,
94
+ kernel.data(), bias.data(),
95
+ 127, 0.5f, 0, 255,
96
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
97
+ if (status != xnn_status_success) {
98
+ state.SkipWithError("failed to create QUINT8 Convolution operator");
99
+ return;
100
+ }
101
+ }
102
+
103
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
104
+ status = xnn_reshape_convolution2d_nhwc_qu8(
105
+ convolution_operators[i],
106
+ batch_size, input_height, input_width,
107
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
108
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape QUINT8 Convolution operator");
+ return;
+ }
109
+ status = xnn_setup_convolution2d_nhwc_qu8(
110
+ convolution_operators[i],
111
+ input.data(), output.data() + i * output_elements);
112
+ if (status != xnn_status_success) {
113
+ state.SkipWithError("failed to setup QUINT8 Convolution operator");
114
+ return;
115
+ }
116
+ }
117
+
118
+ size_t buffer_index = 0;
119
+ for (auto _ : state) {
120
+ state.PauseTiming();
121
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
122
+ buffer_index = (buffer_index + 1) % num_buffers;
123
+ state.ResumeTiming();
124
+
125
+ status = xnn_run_operator(convolution_operators[buffer_index],
126
+ nullptr /* thread pool */);
127
+ if (status != xnn_status_success) {
128
+ state.SkipWithError("failed to run QUINT8 Convolution operator");
129
+ return;
130
+ }
131
+ }
132
+
133
+ for (xnn_operator_t& convolution_op : convolution_operators) {
134
+ status = xnn_delete_operator(convolution_op);
135
+ if (status != xnn_status_success) {
136
+ state.SkipWithError("failed to delete QUINT8 Convolution operator");
137
+ return;
138
+ }
139
+ convolution_op = nullptr;
140
+ }
141
+
142
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
143
+ if (cpu_frequency != 0) {
144
+ state.counters["cpufreq"] = cpu_frequency;
145
+ }
146
+
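+ // The OPS counter below counts one multiply and one add per
+ // multiply-accumulate: each of the N*Hout*Wout*G*GCout outputs accumulates
+ // KH*KW*GCin products, hence the factor of 2 in the formula.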
147
+ state.counters["OPS"] = benchmark::Counter(
148
+ uint64_t(state.iterations()) * 2 *
149
+ batch_size * output_height * output_width *
150
+ groups * group_input_channels * group_output_channels *
151
+ kernel_height * kernel_width,
152
+ benchmark::Counter::kIsRate);
153
+ }
154
+
155
+ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
156
+ const size_t batch_size = state.range(0);
157
+ const size_t input_height = state.range(1);
158
+ const size_t input_width = state.range(2);
159
+ const size_t kernel_height = state.range(3);
160
+ const size_t kernel_width = state.range(4);
161
+ const size_t padding_height = state.range(5);
162
+ const size_t padding_width = state.range(6);
163
+ const size_t subsampling = state.range(7);
164
+ const size_t dilation = state.range(8);
165
+ const size_t groups = state.range(9);
166
+ const size_t group_input_channels = state.range(10);
167
+ const size_t group_output_channels = state.range(11);
168
+
169
+ std::random_device random_device;
170
+ auto rng = std::mt19937(random_device());
171
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
172
+ auto i8rng = std::bind(
173
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
174
+
175
+ const size_t output_pixel_stride = groups * group_output_channels;
176
+ const size_t input_pixel_stride = groups * group_input_channels;
177
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
178
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
179
+ const size_t padding_left = padding_width / 2;
180
+ const size_t padding_top = padding_height / 2;
181
+ const size_t padding_right = padding_width - padding_left;
182
+ const size_t padding_bottom = padding_height - padding_top;
183
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
184
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
185
+
186
+ std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
187
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
188
+ std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
189
+ std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
190
+ std::vector<int32_t> bias(groups * group_output_channels);
191
+ std::generate(bias.begin(), bias.end(), std::ref(i32rng));
192
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
193
+
194
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
195
+ if (status != xnn_status_success) {
196
+ state.SkipWithError("failed to initialize XNNPACK");
197
+ return;
198
+ }
199
+
200
+ const size_t num_buffers = 1 +
201
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
202
+ sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
203
+ std::vector<int8_t> output(output_elements * num_buffers);
204
+
205
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
206
+ for (xnn_operator_t& convolution_op : convolution_operators) {
207
+ status = xnn_create_convolution2d_nhwc_qs8(
208
+ padding_top, padding_right, padding_bottom, padding_left,
209
+ kernel_height, kernel_width,
210
+ subsampling, subsampling,
211
+ dilation, dilation,
212
+ groups, group_input_channels, group_output_channels,
213
+ input_pixel_stride, output_pixel_stride,
214
+ 127, 0.5f, 0.5f,
215
+ kernel.data(), bias.data(),
216
+ 127, 0.5f, -128, 127,
217
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
218
+ if (status != xnn_status_success) {
219
+ state.SkipWithError("failed to create QINT8 Convolution operator");
220
+ return;
221
+ }
222
+ }
223
+
224
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
225
+ status = xnn_reshape_convolution2d_nhwc_qs8(
226
+ convolution_operators[i],
227
+ batch_size, input_height, input_width,
228
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
229
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape QINT8 Convolution operator");
+ return;
+ }
230
+ status = xnn_setup_convolution2d_nhwc_qs8(
231
+ convolution_operators[i],
232
+ input.data(), output.data() + i * output_elements);
233
+ if (status != xnn_status_success) {
234
+ state.SkipWithError("failed to setup QINT8 Convolution operator");
235
+ return;
236
+ }
237
+ }
238
+
239
+ size_t buffer_index = 0;
240
+ for (auto _ : state) {
241
+ state.PauseTiming();
242
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(int8_t));
243
+ buffer_index = (buffer_index + 1) % num_buffers;
244
+ state.ResumeTiming();
245
+
246
+ status = xnn_run_operator(convolution_operators[buffer_index],
247
+ nullptr /* thread pool */);
248
+ if (status != xnn_status_success) {
249
+ state.SkipWithError("failed to run QINT8 Convolution operator");
250
+ return;
251
+ }
252
+ }
253
+
254
+ for (xnn_operator_t& convolution_op : convolution_operators) {
255
+ status = xnn_delete_operator(convolution_op);
256
+ if (status != xnn_status_success) {
257
+ state.SkipWithError("failed to delete QINT8 Convolution operator");
258
+ return;
259
+ }
260
+ convolution_op = nullptr;
261
+ }
262
+
263
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
264
+ if (cpu_frequency != 0) {
265
+ state.counters["cpufreq"] = cpu_frequency;
266
+ }
267
+
268
+ state.counters["OPS"] = benchmark::Counter(
269
+ uint64_t(state.iterations()) * 2 *
270
+ batch_size * output_height * output_width *
271
+ groups * group_input_channels * group_output_channels *
272
+ kernel_height * kernel_width,
273
+ benchmark::Counter::kIsRate);
274
+ }
275
+
276
+ void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
277
+ const size_t batch_size = state.range(0);
278
+ const size_t input_height = state.range(1);
279
+ const size_t input_width = state.range(2);
280
+ const size_t kernel_height = state.range(3);
281
+ const size_t kernel_width = state.range(4);
282
+ const size_t padding_height = state.range(5);
283
+ const size_t padding_width = state.range(6);
284
+ const size_t subsampling = state.range(7);
285
+ const size_t dilation = state.range(8);
286
+ const size_t groups = state.range(9);
287
+ const size_t group_input_channels = state.range(10);
288
+ const size_t group_output_channels = state.range(11);
289
+
290
+ std::random_device random_device;
291
+ auto rng = std::mt19937(random_device());
292
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
293
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
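+ // Binding the f32rng functor itself (a nested bind expression) means it is
+ // re-invoked on every call, so f16rng produces a fresh half-precision value
+ // each time rather than converting a single cached float.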
294
+
295
+ const size_t output_pixel_stride = groups * group_output_channels;
296
+ const size_t input_pixel_stride = groups * group_input_channels;
297
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
298
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
299
+ const size_t padding_left = padding_width / 2;
300
+ const size_t padding_top = padding_height / 2;
301
+ const size_t padding_right = padding_width - padding_left;
302
+ const size_t padding_bottom = padding_height - padding_top;
303
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
304
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
305
+
306
+ std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
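+ // The XNN_EXTRA_BYTES slack in the allocation above lets vectorized
+ // micro-kernels read slightly past the last input element without faulting.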
307
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
308
+ std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
309
+ std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
310
+ std::vector<uint16_t> bias(groups * group_output_channels);
311
+ std::generate(bias.begin(), bias.end(), std::ref(f16rng));
312
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
313
+
314
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
315
+ if (status != xnn_status_success) {
316
+ state.SkipWithError("failed to initialize XNNPACK");
317
+ return;
318
+ }
319
+
320
+ const size_t num_buffers = 1 +
321
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
322
+ sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
323
+ std::vector<uint16_t> output(output_elements * num_buffers);
324
+
325
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
326
+ for (xnn_operator_t& convolution_op : convolution_operators) {
327
+ status = xnn_create_convolution2d_nhwc_f16(
328
+ padding_top, padding_right, padding_bottom, padding_left,
329
+ kernel_height, kernel_width,
330
+ subsampling, subsampling,
331
+ dilation, dilation,
332
+ groups, group_input_channels, group_output_channels,
333
+ input_pixel_stride, output_pixel_stride,
334
+ kernel.data(), bias.data(),
335
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
336
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
337
+ if (status != xnn_status_success) {
338
+ state.SkipWithError("failed to create FP16 Convolution operator");
339
+ return;
340
+ }
341
+ }
342
+
343
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
344
+ status = xnn_reshape_convolution2d_nhwc_f16(
345
+ convolution_operators[i],
346
+ batch_size, input_height, input_width,
347
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
348
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape FP16 Convolution operator");
+ return;
+ }
349
+ status = xnn_setup_convolution2d_nhwc_f16(
350
+ convolution_operators[i],
351
+ input.data(), output.data() + i * output_elements);
352
+ if (status != xnn_status_success) {
353
+ state.SkipWithError("failed to setup FP16 Convolution operator");
354
+ return;
355
+ }
356
+ }
357
+
358
+ size_t buffer_index = 0;
359
+ for (auto _ : state) {
360
+ state.PauseTiming();
361
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
362
+ buffer_index = (buffer_index + 1) % num_buffers;
363
+ state.ResumeTiming();
364
+
365
+ status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
366
+ if (status != xnn_status_success) {
367
+ state.SkipWithError("failed to run FP16 Convolution operator");
368
+ return;
369
+ }
370
+ }
371
+
372
+ for (xnn_operator_t& convolution_op : convolution_operators) {
373
+ status = xnn_delete_operator(convolution_op);
374
+ if (status != xnn_status_success) {
375
+ state.SkipWithError("failed to delete FP16 Convolution operator");
376
+ return;
377
+ }
378
+ convolution_op = nullptr;
379
+ }
380
+
381
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
382
+ if (cpu_frequency != 0) {
383
+ state.counters["cpufreq"] = cpu_frequency;
384
+ }
385
+
386
+ state.counters["FLOPS"] = benchmark::Counter(
387
+ uint64_t(state.iterations()) * 2 *
388
+ batch_size * output_height * output_width *
389
+ groups * group_input_channels * group_output_channels *
390
+ kernel_height * kernel_width,
391
+ benchmark::Counter::kIsRate);
392
+ }
393
+
394
+ void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
395
+ const size_t batch_size = state.range(0);
396
+ const size_t input_height = state.range(1);
397
+ const size_t input_width = state.range(2);
398
+ const size_t kernel_height = state.range(3);
399
+ const size_t kernel_width = state.range(4);
400
+ const size_t padding_height = state.range(5);
401
+ const size_t padding_width = state.range(6);
402
+ const size_t subsampling = state.range(7);
403
+ const size_t dilation = state.range(8);
404
+ const size_t groups = state.range(9);
405
+ const size_t group_input_channels = state.range(10);
406
+ const size_t group_output_channels = state.range(11);
407
+
408
+ std::random_device random_device;
409
+ auto rng = std::mt19937(random_device());
410
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
411
+
412
+ const size_t output_pixel_stride = groups * group_output_channels;
413
+ const size_t input_pixel_stride = groups * group_input_channels;
414
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
415
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
416
+ const size_t padding_left = padding_width / 2;
417
+ const size_t padding_top = padding_height / 2;
418
+ const size_t padding_right = padding_width - padding_left;
419
+ const size_t padding_bottom = padding_height - padding_top;
420
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
421
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
422
+
423
+ std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
424
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
425
+ std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
426
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
427
+ std::vector<float> bias(groups * group_output_channels);
428
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
429
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
430
+
431
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
432
+ if (status != xnn_status_success) {
433
+ state.SkipWithError("failed to initialize XNNPACK");
434
+ return;
435
+ }
436
+
437
+ const size_t num_buffers = 1 +
438
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
439
+ sizeof(float) * (kernel.size() + bias.size() + output_elements));
440
+ std::vector<float> output(output_elements * num_buffers);
441
+
442
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
443
+ for (xnn_operator_t& convolution_op : convolution_operators) {
444
+ status = xnn_create_convolution2d_nhwc_f32(
445
+ padding_top, padding_right, padding_bottom, padding_left,
446
+ kernel_height, kernel_width,
447
+ subsampling, subsampling,
448
+ dilation, dilation,
449
+ groups, group_input_channels, group_output_channels,
450
+ input_pixel_stride, output_pixel_stride,
451
+ kernel.data(), bias.data(),
452
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
453
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
454
+ if (status != xnn_status_success) {
455
+ state.SkipWithError("failed to create FP32 Convolution operator");
456
+ return;
457
+ }
458
+ }
459
+
460
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
461
+ status = xnn_reshape_convolution2d_nhwc_f32(
462
+ convolution_operators[i],
463
+ batch_size, input_height, input_width,
464
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
465
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape FP32 Convolution operator");
+ return;
+ }
466
+ status = xnn_setup_convolution2d_nhwc_f32(
467
+ convolution_operators[i],
468
+ input.data(), output.data() + i * output_elements);
469
+ if (status != xnn_status_success) {
470
+ state.SkipWithError("failed to setup FP32 Convolution operator");
471
+ return;
472
+ }
473
+ }
474
+
475
+ size_t buffer_index = 0;
476
+ for (auto _ : state) {
477
+ state.PauseTiming();
478
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
479
+ buffer_index = (buffer_index + 1) % num_buffers;
480
+ state.ResumeTiming();
481
+
482
+ status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
483
+ if (status != xnn_status_success) {
484
+ state.SkipWithError("failed to run FP32 Convolution operator");
485
+ return;
486
+ }
487
+ }
488
+
489
+ for (xnn_operator_t& convolution_op : convolution_operators) {
490
+ status = xnn_delete_operator(convolution_op);
491
+ if (status != xnn_status_success) {
492
+ state.SkipWithError("failed to delete FP32 Convolution operator");
493
+ return;
494
+ }
495
+ convolution_op = nullptr;
496
+ }
497
+
498
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
499
+ if (cpu_frequency != 0) {
500
+ state.counters["cpufreq"] = cpu_frequency;
501
+ }
502
+
503
+ state.counters["FLOPS"] = benchmark::Counter(
504
+ uint64_t(state.iterations()) * 2 *
505
+ batch_size * output_height * output_width *
506
+ groups * group_input_channels * group_output_channels *
507
+ kernel_height * kernel_width,
508
+ benchmark::Counter::kIsRate);
509
+ }
510
+
511
+ #ifdef BENCHMARK_TENSORFLOW_LITE
512
+ void tflite_convolution_f32(benchmark::State& state, const char* net) {
513
+ const size_t batch_size = state.range(0);
514
+ const size_t input_height = state.range(1);
515
+ const size_t input_width = state.range(2);
516
+ const size_t kernel_height = state.range(3);
517
+ const size_t kernel_width = state.range(4);
518
+ const size_t padding_height = state.range(5);
519
+ const size_t padding_width = state.range(6);
520
+ const size_t subsampling = state.range(7);
521
+ const size_t dilation = state.range(8);
522
+ const size_t groups = state.range(9);
523
+ const size_t group_input_channels = state.range(10);
524
+ const size_t group_output_channels = state.range(11);
525
+
526
+ bool is_depthwise = false;
527
+ if (groups != 1) {
528
+ if (group_input_channels == 1) {
529
+ is_depthwise = true;
530
+ } else {
531
+ state.SkipWithError("grouped convolution is not supported");
532
+ return;
533
+ }
534
+ }
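+ // TFLite has no general grouped convolution: groups != 1 is representable
+ // only as DEPTHWISE_CONV_2D (one input channel per group); other groupings
+ // are skipped above.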
535
+
536
+ std::random_device random_device;
537
+ auto rng = std::mt19937(random_device());
538
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
539
+
540
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
541
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
542
+
543
+ tflite::Padding padding = tflite::Padding_VALID;
544
+ if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
545
+ padding = tflite::Padding_SAME;
546
+ } else if (padding_width == 0 && padding_height == 0) {
547
+ padding = tflite::Padding_VALID;
548
+ } else {
549
+ state.SkipWithError("unsupported padding");
550
+ return;
551
+ }
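+ // TFLite encodes padding only as the SAME/VALID enum, so the explicit
+ // padding amounts are mapped back: a total padding of (effective kernel
+ // size - 1) is treated as SAME, zero padding as VALID, and anything else
+ // cannot be expressed in the model.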
552
+
553
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
554
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
555
+
556
+ std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
557
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
558
+ std::vector<float> bias(groups * group_output_channels);
559
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
560
+
561
+ flatbuffers::FlatBufferBuilder builder;
562
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
563
+ CreateOperatorCode(
564
+ builder,
565
+ is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
566
+ 0);
567
+
568
+ flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
569
+ builder,
570
+ padding,
571
+ static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
572
+ tflite::ActivationFunctionType_NONE,
573
+ static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
574
+
575
+ flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
576
+ builder,
577
+ padding,
578
+ static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
579
+ static_cast<int32_t>(group_output_channels),
580
+ tflite::ActivationFunctionType_NONE,
581
+ static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
582
+
583
+ flatbuffers::Offset<tflite::Buffer> buffers[3] = {
584
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
585
+ tflite::CreateBuffer(builder, builder.CreateVector(
586
+ reinterpret_cast<const uint8_t*>(kernel.data()),
587
+ sizeof(float) * kernel.size())),
588
+ tflite::CreateBuffer(builder, builder.CreateVector(
589
+ reinterpret_cast<const uint8_t*>(bias.data()),
590
+ sizeof(float) * bias.size())),
591
+ };
592
+
593
+ const int32_t input_shape[4] = {
594
+ static_cast<int32_t>(batch_size),
595
+ static_cast<int32_t>(input_height),
596
+ static_cast<int32_t>(input_width),
597
+ static_cast<int32_t>(groups * group_input_channels)
598
+ };
599
+ const int32_t output_shape[4] = {
600
+ static_cast<int32_t>(batch_size),
601
+ static_cast<int32_t>(output_height),
602
+ static_cast<int32_t>(output_width),
603
+ static_cast<int32_t>(groups * group_output_channels)
604
+ };
605
+ const int32_t filter_shape[4] = {
606
+ static_cast<int32_t>(group_output_channels),
607
+ static_cast<int32_t>(kernel_height),
608
+ static_cast<int32_t>(kernel_width),
609
+ static_cast<int32_t>(groups * group_input_channels)
610
+ };
611
+ const int32_t bias_shape[1] = {
612
+ static_cast<int32_t>(groups * group_output_channels)
613
+ };
614
+
615
+ flatbuffers::Offset<tflite::Tensor> tensors[4] = {
616
+ tflite::CreateTensor(builder,
617
+ builder.CreateVector<int32_t>(input_shape, 4),
618
+ tflite::TensorType_FLOAT32,
619
+ 0 /* buffer id */,
620
+ builder.CreateString("input")),
621
+ tflite::CreateTensor(builder,
622
+ builder.CreateVector<int32_t>(filter_shape, 4),
623
+ tflite::TensorType_FLOAT32,
624
+ 1 /* buffer id */,
625
+ builder.CreateString("filter")),
626
+ tflite::CreateTensor(builder,
627
+ builder.CreateVector<int32_t>(bias_shape, 1),
628
+ tflite::TensorType_FLOAT32,
629
+ 2 /* buffer id */,
630
+ builder.CreateString("bias")),
631
+ tflite::CreateTensor(builder,
632
+ builder.CreateVector<int32_t>(output_shape, 4),
633
+ tflite::TensorType_FLOAT32,
634
+ 0 /* buffer id */,
635
+ builder.CreateString("output")),
636
+ };
637
+
638
+ const int32_t op_inputs[3] = { 0, 1, 2 };
639
+ const int32_t op_outputs[1] = { 3 };
640
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
641
+ builder,
642
+ 0 /* opcode_index */,
643
+ builder.CreateVector<int32_t>(op_inputs, 3),
644
+ builder.CreateVector<int32_t>(op_outputs, 1),
645
+ is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
646
+ is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
647
+ /*custom_options=*/0,
648
+ tflite::CustomOptionsFormat_FLEXBUFFERS);
649
+
650
+ const int32_t graph_inputs[1] = { 0 };
651
+ const int32_t graph_outputs[1] = { 3 };
652
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
653
+ builder,
654
+ builder.CreateVector(tensors, 4),
655
+ builder.CreateVector<int32_t>(graph_inputs, 1),
656
+ builder.CreateVector<int32_t>(graph_outputs, 1),
657
+ builder.CreateVector(&op, 1),
658
+ builder.CreateString("Conv2D subgraph"));
659
+
660
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");
661
+
662
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
663
+ TFLITE_SCHEMA_VERSION,
664
+ builder.CreateVector(&operator_code, 1),
665
+ builder.CreateVector(&subgraph, 1),
666
+ description,
667
+ builder.CreateVector(buffers, 3));
668
+
669
+ builder.Finish(model_buffer);
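+ // builder now holds a complete in-memory FlatBuffer for a single-op model
+ // (input -> Conv2D/DepthwiseConv2D -> output) that the interpreter below
+ // loads directly, with weights and bias embedded as constant buffers.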
670
+
671
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
672
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
673
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
674
+ std::unique_ptr<tflite::Interpreter> interpreter;
675
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
676
+ state.SkipWithError("failed to create TFLite interpreter");
677
+ return;
678
+ }
679
+ if (interpreter == nullptr) {
680
+ state.SkipWithError("TFLite interpreter is null");
681
+ return;
682
+ }
683
+ interpreter->SetNumThreads(1);
684
+
685
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
686
+ state.SkipWithError("failed to allocate tensors");
687
+ return;
688
+ }
689
+
690
+ std::generate(
691
+ interpreter->typed_tensor<float>(0),
692
+ interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
693
+ std::ref(f32rng));
694
+
695
+ for (auto _ : state) {
696
+ state.PauseTiming();
697
+ benchmark::utils::WipeCache();
698
+ benchmark::utils::PrefetchToL1(
699
+ interpreter->typed_tensor<float>(0),
700
+ batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
701
+ state.ResumeTiming();
702
+
703
+ if (interpreter->Invoke() != kTfLiteOk) {
704
+ state.SkipWithError("failed to invoke TFLite interpreter");
705
+ return;
706
+ }
707
+ }
708
+
709
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
710
+ if (cpu_frequency != 0) {
711
+ state.counters["cpufreq"] = cpu_frequency;
712
+ }
713
+
714
+ state.counters["FLOPS"] = benchmark::Counter(
715
+ uint64_t(state.iterations()) * 2 *
716
+ batch_size * output_height * output_width *
717
+ groups * group_input_channels * group_output_channels *
718
+ kernel_height * kernel_width,
719
+ benchmark::Counter::kIsRate);
720
+
721
+ interpreter.reset();
722
+ }
723
+ #endif // BENCHMARK_TENSORFLOW_LITE
724
+
725
+ // ShuffleNet v1 with 1 group.
726
+ static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
727
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
728
+
729
+ /*************************** Conv 1 **************************/
730
+ /* N H W KH KW PH PW S D G GCin GCout */
731
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
732
+ /******************* Stage 2: stride-2 unit ******************/
733
+ /* N H W KH KW PH PW S D G GCin GCout */
734
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 36});
735
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 36, 1, 1});
736
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 120});
737
+ /******************* Stage 2: stride-1 units *****************/
738
+ /* N H W KH KW PH PW S D G GCin GCout */
739
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 36});
740
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 36, 1, 1});
741
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 144});
742
+ /******************* Stage 3: stride-2 unit ******************/
743
+ /* N H W KH KW PH PW S D G GCin GCout */
744
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 72});
745
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 72, 1, 1});
746
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 144});
747
+ /******************* Stage 3: stride-1 units *****************/
748
+ /* N H W KH KW PH PW S D G GCin GCout */
749
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 72});
750
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 72, 1, 1});
751
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 288});
752
+ /******************* Stage 4: stride-2 unit ******************/
753
+ /* N H W KH KW PH PW S D G GCin GCout */
754
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 144});
755
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 144, 1, 1});
756
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 288});
757
+ /******************* Stage 4: stride-1 units *****************/
758
+ /* N H W KH KW PH PW S D G GCin GCout */
759
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 144});
760
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 144, 1, 1});
761
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 576});
762
+ }
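+ // Reading one row of the table above: {1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24}
+ // is a batch-1, 3x3, stride-2, single-group convolution over a 224x224x3
+ // input with total padding 2 per dimension, producing 24 channels at
+ // (224 + 2 - 3) / 2 + 1 = 112x112.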
763
+
764
+ // ShuffleNet v1 with 2 groups.
765
+ static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
766
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
767
+
768
+ /*************************** Conv 1 **************************/
769
+ /* N H W KH KW PH PW S D G GCin GCout */
770
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
771
+ /******************* Stage 2: stride-2 unit ******************/
772
+ /* N H W KH KW PH PW S D G GCin GCout */
773
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 50});
774
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 50, 1, 1});
775
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 88});
776
+ /******************* Stage 2: stride-1 units *****************/
777
+ /* N H W KH KW PH PW S D G GCin GCout */
778
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 25});
779
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 50, 1, 1});
780
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 100});
781
+ /******************* Stage 3: stride-2 unit ******************/
782
+ /* N H W KH KW PH PW S D G GCin GCout */
783
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 50});
784
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 100, 1, 1});
785
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 100});
786
+ /******************* Stage 3: stride-1 units *****************/
787
+ /* N H W KH KW PH PW S D G GCin GCout */
788
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 50});
789
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 100, 1, 1});
790
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 200});
791
+ /******************* Stage 4: stride-2 unit ******************/
792
+ /* N H W KH KW PH PW S D G GCin GCout */
793
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 100});
794
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 200, 1, 1});
795
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 200});
796
+ /******************* Stage 4: stride-1 units *****************/
797
+ /* N H W KH KW PH PW S D G GCin GCout */
798
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 400, 100});
799
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 200, 1, 1});
800
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 400});
801
+ }
802
+
803
+ // ShuffleNet v1 with 3 groups.
804
+ static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
805
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
806
+
807
+ /*************************** Conv 1 **************************/
808
+ /* N H W KH KW PH PW S D G GCin GCout */
809
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
810
+ /******************* Stage 2: stride-2 unit ******************/
811
+ /* N H W KH KW PH PW S D G GCin GCout */
812
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 60});
813
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 60, 1, 1});
814
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 72});
815
+ /******************* Stage 2: stride-1 units *****************/
816
+ /* N H W KH KW PH PW S D G GCin GCout */
817
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 20});
818
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 60, 1, 1});
819
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 80});
820
+ /******************* Stage 3: stride-2 unit ******************/
821
+ /* N H W KH KW PH PW S D G GCin GCout */
822
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 40});
823
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 120, 1, 1});
824
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 80});
825
+ /******************* Stage 3: stride-1 units *****************/
826
+ /* N H W KH KW PH PW S D G GCin GCout */
827
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 40});
828
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 120, 1, 1});
829
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 160});
830
+ /******************* Stage 4: stride-2 unit ******************/
831
+ /* N H W KH KW PH PW S D G GCin GCout */
832
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 80});
833
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 240, 1, 1});
834
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 160});
835
+ /******************* Stage 4: stride-1 units *****************/
836
+ /* N H W KH KW PH PW S D G GCin GCout */
837
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 320, 80});
838
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 240, 1, 1});
839
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 320});
840
+ }
841
+
842
+ // ShuffleNet v1 with 4 groups.
843
+ static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
844
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
845
+
846
+ /*************************** Conv 1 **************************/
847
+ /* N H W KH KW PH PW S D G GCin GCout */
848
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
849
+ /******************* Stage 2: stride-2 unit ******************/
850
+ /* N H W KH KW PH PW S D G GCin GCout */
851
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 68});
852
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 68, 1, 1});
853
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 62});
854
+ /******************* Stage 2: stride-1 units *****************/
855
+ /* N H W KH KW PH PW S D G GCin GCout */
856
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 17});
857
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 68, 1, 1});
858
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 68});
859
+ /******************* Stage 3: stride-2 unit ******************/
860
+ /* N H W KH KW PH PW S D G GCin GCout */
861
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 34});
862
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 136, 1, 1});
863
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 68});
864
+ /******************* Stage 3: stride-1 units *****************/
865
+ /* N H W KH KW PH PW S D G GCin GCout */
866
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 34});
867
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 136, 1, 1});
868
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 136});
869
+ /******************* Stage 4: stride-2 unit ******************/
870
+ /* N H W KH KW PH PW S D G GCin GCout */
871
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 68});
872
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 272, 1, 1});
873
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 136});
874
+ /******************* Stage 4: stride-1 units *****************/
875
+ /* N H W KH KW PH PW S D G GCin GCout */
876
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 272, 68});
877
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 272, 1, 1});
878
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 272});
879
+ }
880
+
881
+ // ShuffleNet v1 with 8 groups.
882
+ static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
883
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
884
+
885
+ /*************************** Conv 1 **************************/
886
+ /* N H W KH KW PH PW S D G GCin GCout */
887
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
888
+ /******************* Stage 2: stride-2 unit ******************/
889
+ /* N H W KH KW PH PW S D G GCin GCout */
890
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 96});
891
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 96, 1, 1});
892
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 45});
893
+ /******************* Stage 2: stride-1 units *****************/
894
+ /* N H W KH KW PH PW S D G GCin GCout */
895
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 12});
896
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 96, 1, 1});
897
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 48});
898
+ /******************* Stage 3: stride-2 unit ******************/
899
+ /* N H W KH KW PH PW S D G GCin GCout */
900
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 24});
901
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
902
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 48});
903
+ /******************* Stage 3: stride-1 units *****************/
904
+ /* N H W KH KW PH PW S D G GCin GCout */
905
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 24});
906
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 192, 1, 1});
907
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 96});
908
+ /******************* Stage 4: stride-2 unit ******************/
909
+ /* N H W KH KW PH PW S D G GCin GCout */
910
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 48});
911
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 384, 1, 1});
912
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 96});
913
+ /******************* Stage 4: stride-1 units *****************/
914
+ /* N H W KH KW PH PW S D G GCin GCout */
915
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 192, 48});
916
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 384, 1, 1});
917
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 192});
918
+ }
919
+
920
+ // ShuffleNet v2 (0.5X scale)
921
+ static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
922
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
923
+
924
+ /*************************** Conv 1 **************************/
925
+ /* N H W KH KW PH PW S D G GCin GCout */
926
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
927
+ /************************** Stage 2 **************************/
928
+ /* N H W KH KW PH PW S D G GCin GCout */
929
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
930
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 24});
931
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 24});
932
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 24, 1, 1});
933
+ /************************** Stage 3 **************************/
934
+ /* N H W KH KW PH PW S D G GCin GCout */
935
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 48, 1, 1});
936
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 48});
937
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 48, 48});
938
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 48, 1, 1});
939
+ /************************** Stage 4 **************************/
940
+ /* N H W KH KW PH PW S D G GCin GCout */
941
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 96, 1, 1});
942
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 96});
943
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 96});
944
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 96, 1, 1});
945
+ /*************************** Conv 5 **************************/
946
+ /* N H W KH KW PH PW S D G GCin GCout */
947
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 192, 1024});
948
+ }
949
+
950
+ // ShuffleNet v2 (1.0X scale)
951
+ static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
952
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
953
+
954
+ /*************************** Conv 1 **************************/
955
+ /* N H W KH KW PH PW S D G GCin GCout */
956
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
957
+ /************************** Stage 2 **************************/
958
+ /* N H W KH KW PH PW S D G GCin GCout */
959
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
960
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 58});
961
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 58});
962
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 58, 1, 1});
963
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 58, 58});
964
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 58, 1, 1});
965
+ /************************** Stage 3 **************************/
966
+ /* N H W KH KW PH PW S D G GCin GCout */
967
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 116, 1, 1});
968
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 116, 116});
969
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 116, 116});
970
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 116, 1, 1});
971
+ /************************** Stage 4 **************************/
972
+ /* N H W KH KW PH PW S D G GCin GCout */
973
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 232, 1, 1});
974
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 232, 232});
975
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 232, 232});
976
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 232, 1, 1});
977
+ /*************************** Conv 5 **************************/
978
+ /* N H W KH KW PH PW S D G GCin GCout */
979
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 464, 1024});
980
+ }
981
+
982
+ // ShuffleNet v2 (1.5X scale)
983
+ static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
984
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
985
+
986
+ /*************************** Conv 1 **************************/
987
+ /* N H W KH KW PH PW S D G GCin GCout */
988
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
989
+ /************************** Stage 2 **************************/
990
+ /* N H W KH KW PH PW S D G GCin GCout */
991
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
992
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
993
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 88});
994
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 88, 1, 1});
995
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 88});
996
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
997
+ /************************** Stage 3 **************************/
998
+ /* N H W KH KW PH PW S D G GCin GCout */
999
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 176, 1, 1});
1000
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1001
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1002
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 176, 1, 1});
1003
+ /************************** Stage 4 **************************/
1004
+ /* N H W KH KW PH PW S D G GCin GCout */
1005
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 352, 1, 1});
1006
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1007
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1008
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 352, 1, 1});
1009
+ /*************************** Conv 5 **************************/
1010
+ /* N H W KH KW PH PW S D G GCin GCout */
1011
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 704, 1024});
1012
+ }
1013
+
1014
+ // ShuffleNet v2 (2.0X scale)
1015
+ static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
1016
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1017
+
1018
+ /*************************** Conv 1 **************************/
1019
+ /* N H W KH KW PH PW S D G GCin GCout */
1020
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1021
+ /************************** Stage 2 **************************/
1022
+ /* N H W KH KW PH PW S D G GCin GCout */
1023
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1024
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1025
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1026
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 122, 1, 1});
1027
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 122, 122});
1028
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 122, 1, 1});
1029
+ /************************** Stage 3 **************************/
1030
+ /* N H W KH KW PH PW S D G GCin GCout */
1031
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 244, 1, 1});
1032
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1033
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1034
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 244, 1, 1});
1035
+ /************************** Stage 4 **************************/
1036
+ /* N H W KH KW PH PW S D G GCin GCout */
1037
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 488, 1, 1});
1038
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1039
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1040
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 488, 1, 1});
1041
+ /*************************** Conv 5 **************************/
1042
+ /* N H W KH KW PH PW S D G GCin GCout */
1043
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 976, 2048});
1044
+ }
1045
+
1046
+ static void MobileNetV1(benchmark::internal::Benchmark* b) {
1047
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1048
+
1049
+ /* N H W KH KW PH PW S D G GCin GCout */
1050
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1051
+ b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1052
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 64});
1053
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
1054
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 128});
1055
+ b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 128, 1, 1});
1056
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 128, 128});
1057
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 128, 1, 1});
1058
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 256});
1059
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 256, 1, 1});
1060
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 256, 256});
1061
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 256, 1, 1});
1062
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 512});
1063
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 512, 1, 1});
1064
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
1065
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 512, 1, 1});
1066
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 1024});
1067
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1024, 1, 1});
1068
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 1024, 1024});
1069
+ }
1070
+
1071
+ static void MobileNetV2(benchmark::internal::Benchmark* b) {
1072
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1073
+
1074
+ /* N H W KH KW PH PW S D G GCin GCout */
1075
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1076
+
1077
+ /************************ Bottleneck 1 ***********************/
1078
+ /* N H W KH KW PH PW S D G GCin GCout */
1079
+ b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1080
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 16});
1081
+
1082
+ /************************ Bottleneck 2 ***********************/
1083
+ /* N H W KH KW PH PW S D G GCin GCout */
1084
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 96});
1085
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1086
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 96, 24});
1087
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1088
+ b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 144, 1, 1});
1089
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 144, 24});
1090
+
1091
+ /************************ Bottleneck 3 ***********************/
1092
+ /* N H W KH KW PH PW S D G GCin GCout */
1093
+ //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1094
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 144, 1, 1});
1095
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 32});
1096
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1097
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1098
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1099
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1100
+ //b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1101
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1102
+
1103
+ /************************ Bottleneck 4 ***********************/
1104
+ /* N H W KH KW PH PW S D G GCin GCout */
1105
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1106
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
1107
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 192, 64});
1108
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1109
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1110
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1111
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1112
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1113
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1114
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1115
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1116
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1117
+
1118
+ /************************ Bottleneck 5 ***********************/
1119
+ /* N H W KH KW PH PW S D G GCin GCout */
1120
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1121
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1122
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 96});
1123
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1124
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1125
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1126
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1127
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1128
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1129
+
1130
+ /************************ Bottleneck 6 ***********************/
1131
+ /* N H W KH KW PH PW S D G GCin GCout */
1132
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1133
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 576, 1, 1});
1134
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 160});
1135
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1136
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1137
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1138
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1139
+ //b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1140
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1141
+
1142
+ /************************ Bottleneck 7 ***********************/
1143
+ /* N H W KH KW PH PW S D G GCin GCout */
1144
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1145
+ //b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1146
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 320});
1147
+
1148
+ /******************** Pre-pooling Conv2D *********************/
1149
+ /* N H W KH KW PH PW S D G GCin GCout */
1150
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 320, 1280});
1151
+ /******************** Post-pooling Conv2D ********************/
1152
+ /* N H W KH KW PH PW S D G GCin GCout */
1153
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1000});
1154
+ }
1155
+
1156
+ static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
1157
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1158
+
1159
+ /*********************** Initial Stage ***********************/
1160
+ /* N H W KH KW PH PW S D G GCin GCout */
1161
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
1162
+ /*********************** Bottleneck 1 ************************/
1163
+ /* N H W KH KW PH PW S D G GCin GCout */
1164
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 16, 1, 1});
1165
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 16, 8});
1166
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 8, 16});
1167
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 16});
1168
+ /*********************** Bottleneck 2 ************************/
1169
+ /* N H W KH KW PH PW S D G GCin GCout */
1170
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 72});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 72, 1, 1});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   /*********************** Bottleneck 3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 24});
+   /*********************** Bottleneck 4 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 96});
+   b->Args({1, 28, 28, 5, 5, 4, 4, 2, 1, 96, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 96, 24});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 96});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 40});
+   /*********************** Bottleneck 5 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
+   /*********************** Bottleneck 6 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   //b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
+   /*********************** Bottleneck 7 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 120, 48});
+   /*********************** Bottleneck 8 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 144});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 144, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 40});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 40, 144});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 144, 48});
+   /*********************** Bottleneck 9 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 288});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 288, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 288, 72});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 288});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 288, 96});
+   /*********************** Bottleneck 10 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
+   /*********************** Bottleneck 11 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   //b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
+   /************************ Last Stage ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 1024});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1024, 1001});
+ }
+
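+ // Column legend for the Args rows in the model functions here (mirrors
+ // ArgNames): N = batch, H/W = input spatial size, KH/KW = kernel size,
+ // PH/PW = total padding, S = stride, D = dilation, G = groups, and
+ // GCin/GCout = channels per group. For example,
+ // {1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88} is an unpadded 1x1 pointwise
+ // convolution taking a 28x28x24 input to a 28x28x88 output, while rows with
+ // G > 1 and GCin = GCout = 1 are depthwise convolutions.
+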
+ static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /*********************** Initial Stage ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
+   /*********************** Bottleneck 1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 16, 1, 1});
+   b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 16});
+   /*********************** Bottleneck 2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 24});
+   /*********************** Bottleneck 3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 72, 1, 1});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   /*********************** Bottleneck 4 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 56, 56, 5, 5, 4, 4, 2, 1, 72, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 40});
+   /*********************** Bottleneck 5 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
+   /*********************** Bottleneck 6 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   //b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
+   /*********************** Bottleneck 7 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 240, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 80});
+   /*********************** Bottleneck 8 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 200});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 200, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 200, 80});
+   /*********************** Bottleneck 9 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
+   /********************** Bottleneck 10 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
+   //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
+   /********************** Bottleneck 11 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 480});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 480, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 480, 120});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 480});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 480, 112});
+   /********************** Bottleneck 12 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 672, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 672, 168});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 168, 672});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 672, 112});
+   /********************** Bottleneck 13 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 672, 1, 1});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 672, 160});
+   /********************** Bottleneck 14 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
+   /********************** Bottleneck 15 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   //b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
+   /************************ Last Stage ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 1280});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1001});
+ }
+
+ // SqueezeNet 1.0
+ static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************** Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 96});
+   /************************** Fire 2 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 96, 16});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 3 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
+   //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 4 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 32});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 5 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 6 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 7 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 48});
+   //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 8 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 64});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************** Fire 9 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************* Conv 10 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************** Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 64});
+   /************************** Fire 2 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 64, 16});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 3 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
+   //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 4 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 128, 32});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 5 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
+   //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 6 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 7 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 48});
+   //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 8 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 64});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************** Fire 9 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
+   //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************* Conv 10 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
+ }
+
+ static void InceptionV3(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 299, 299, 3, 3, 0, 0, 2, 1, 1, 3, 32});
+   b->Args({1, 149, 149, 3, 3, 0, 0, 1, 1, 1, 32, 32});
+   b->Args({1, 147, 147, 3, 3, 2, 2, 1, 1, 1, 32, 64});
+   b->Args({1, 73, 73, 1, 1, 0, 0, 1, 1, 1, 64, 80});
+   b->Args({1, 73, 73, 3, 3, 0, 0, 1, 1, 1, 80, 192});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 48});
+   b->Args({1, 35, 35, 5, 5, 4, 4, 1, 1, 1, 48, 64});
+   b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 64, 96});
+   b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 96, 96});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 32});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 48});
+   b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 288, 384});
+   b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 96, 96});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 192});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 128});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 128});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 128});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 192});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 160});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 160});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 160});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 192});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 192, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 192, 192});
+   b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 320});
+   b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 192});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 320});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 384});
+   b->Args({1, 8, 8, 1, 3, 0, 2, 1, 1, 1, 384, 384});
+   b->Args({1, 8, 8, 3, 1, 2, 0, 1, 1, 1, 384, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 448});
+   b->Args({1, 8, 8, 3, 3, 2, 2, 1, 1, 1, 448, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 192});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 320});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 448});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 192});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2048, 1001});
+ }
+
+ static void ResNet18(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
+   /************************ Conv 2.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   /************************ Conv 3.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 64, 128});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 64, 128});
+   /************************ Conv 4.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 128, 256});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 128, 256});
+   /************************ Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 256, 512});
+   b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 256, 512});
+ }
+
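+ // In the ResNet argument sets, the trailing 1x1 row with S = 2 in each
+ // downsampling stage is the projection shortcut of the residual block;
+ // commented-out rows repeat shapes that are already benchmarked above them.
+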
+ static void ResNet50(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
+   /************************ Conv 2.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 64});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   /************************ Conv 2.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 64});
+   //b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   /************************ Conv 3.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 128});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 128, 128});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 256, 512});
+   /************************ Conv 3.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 128});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
+   /************************ Conv 4.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 256});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 256, 256});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 512, 1024});
+   /************************ Conv 4.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 256});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
+   /************************ Conv 5.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 512});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 512, 512});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 1024, 2048});
+   /************************ Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 2048, 512});
+   b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
+ }
+
+ static void VGG(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 3, 64});
+   /************************* Conv 1.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+
+   /************************* Conv 2.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 64, 128});
+   /************************* Conv 2.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+
+   /************************* Conv 3.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 128, 256});
+   /************************* Conv 3.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   /************************* Conv 3.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 256});
+
+   /************************* Conv 4.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 256, 512});
+   /************************* Conv 4.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   /************************* Conv 4.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 512});
+
+   /************************* Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   /************************* Conv 5.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
+ }
+
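+ // The SRCNN variants below are named after the kernel sizes of their three
+ // convolution layers (e.g. 9-1-5 = 9x9, then 1x1, then 5x5). Every layer is
+ // unpadded, so each KxK convolution shrinks the spatial size by K - 1:
+ // 384 -> 376 after a 9x9 layer, and 376 -> 372 after a 5x5 layer.
+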
+ // SRCNN (9-1-5)
+ static void SRCNN915(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 1, 1, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-3-5)
+ static void SRCNN935(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 3, 3, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 374, 374, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-5-5)
+ static void SRCNN955(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 372, 372, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
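+ // Each BENCHMARK_CAPTURE(fn, name, label) below registers the wrapper fn as a
+ // Google Benchmark called "fn/name" and forwards the label string as fn's
+ // trailing `net` argument; Apply(Model) instantiates one benchmark per Args
+ // row defined above, and UseRealTime() reports wall-clock time rather than
+ // per-thread CPU time.
+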
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-bfly4.cc ADDED
@@ -0,0 +1,116 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/fft.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ void cs16_bfly4(
+     benchmark::State& state,
+     xnn_cs16_bfly4_ukernel_fn bfly4,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t fft_size = state.range(0);
+   const size_t batch = state.range(1);
+   const size_t samples = state.range(2);
+   const size_t stride = state.range(3);
+
+   assert(fft_size == samples * stride * 4);  // 4 for bfly4.
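+   // A radix-4 butterfly combines 4 complex inputs, so one stage of an
+   // fft_size-point FFT performs fft_size / 4 == samples * stride butterflies,
+   // and each butterfly consumes 3 twiddle factors; that is why the twiddle
+   // buffer below holds fft_size * 3 / 4 complex values (2 int16 each).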
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> output(fft_size * 2);
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(fft_size * 3 / 4 * 2);
+
+   std::iota(output.begin(), output.end(), 0);
+   std::iota(twiddle.begin(), twiddle.end(), 0);
+
+   for (auto _ : state) {
+     bfly4(batch, samples * sizeof(int16_t) * 2, output.data(), twiddle.data(), stride * sizeof(int16_t) * 2);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
+
+ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 1, 64});
+   b->Args({256, 4, 1, 64});
+   b->Args({256, 1, 4, 16});
+   b->Args({256, 4, 4, 16});
+   b->Args({256, 1, 16, 4});
+   b->Args({256, 4, 16, 4});
+   b->Args({256, 1, 64, 1});
+ }
+
+ static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 1, 64});
+   b->Args({256, 4, 1, 64});
+   b->Args({256, 16, 1, 64});
+   b->Args({256, 64, 1, 64});
+ }
+
+ static void BenchmarkSamples4KernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 4, 16});
+   b->Args({256, 4, 4, 16});
+   b->Args({256, 16, 4, 16});
+ }
+
+ #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x1, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x2, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x4, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__neon, xnn_cs16_bfly4_samples1_ukernel__neon)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples4__neon, xnn_cs16_bfly4_samples4_ukernel__neon)
+   ->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, neon_x1, xnn_cs16_bfly4_ukernel__neon_x1)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, neon_x4, xnn_cs16_bfly4_ukernel__neon_x4)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__scalar, xnn_cs16_bfly4_samples1_ukernel__scalar)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples4__scalar, xnn_cs16_bfly4_samples4_ukernel__scalar)
+   ->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x4, xnn_cs16_bfly4_ukernel__scalar_x4)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-fftr.cc ADDED
@@ -0,0 +1,73 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/fft.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ void cs16_fftr(
+     benchmark::State& state,
+     xnn_cs16_fftr_ukernel_fn fftr,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t samples = state.range(0);
+
+   assert(samples % 2 == 0);
+   const size_t sample_size = samples * 2 + 2;
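+   // Judging by the buffer sizing (an inference, not a documented contract):
+   // the fftr post-processing of a real FFT writes samples + 1 complex bins
+   // (DC through Nyquist), and stored as interleaved (re, im) int16 pairs
+   // that is exactly the samples * 2 + 2 elements allocated below.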
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> data(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(samples);
+
+   std::iota(data.begin(), data.end(), 0);
+   std::iota(twiddle.begin(), twiddle.end(), 2);
+
+   for (auto _ : state) {
+     fftr(samples, data.data(), twiddle.data());
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
+
+ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"samples"});
+   b->Args({256});
+   b->Args({1024});
+ }
+
+ #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x1, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x4, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_neon_x4, xnn_cs16_fftr_ukernel__neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x1, xnn_cs16_fftr_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x2, xnn_cs16_fftr_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x4, xnn_cs16_fftr_ukernel__scalar_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-vsquareabs.cc ADDED
@@ -0,0 +1,127 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <complex>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vsquareabs.h>
+
+
+ void cs16_vsquareabs(
+     benchmark::State& state,
+     xnn_cs16_vsquareabs_ukernel_fn vsquareabs,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t num_elements = state.range(0);
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(
+     num_elements * 2 + XNN_EXTRA_BYTES / sizeof(int16_t));
+   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> output(num_elements);
+   std::iota(input.begin(), input.end(), 0);
+   std::iota(output.begin(), output.end(), 0);
+
+   for (auto _ : state) {
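+     // The first ukernel argument is the input size in bytes: each complex
+     // element is an interleaved (re, im) int16 pair (4 bytes), and the kernel
+     // writes one uint32 squared magnitude, re*re + im*im, per element.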
+     vsquareabs(num_elements * sizeof(int16_t) * 2, input.data(), output.data());
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = num_elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = num_elements * (sizeof(std::complex<int16_t>) + sizeof(uint32_t));
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x4,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x8,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x12,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x16,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_HEXAGON
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x2,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x4,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x4)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x6,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x6)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x8,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x8)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x10,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x10)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x12,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x12)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_HEXAGON
+
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x1,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x1)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x2,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x3,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x3)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x4,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x4)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/dconv.h ADDED
@@ -0,0 +1,54 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+
+ #define BENCHMARK_DCONV(conv_fn) \
+   BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1/v2")->Apply(MobileNetConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, mobilenet_v3, "MobileNet v3")->Apply(MobileNetV3ConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, shufflenet, "ShuffleNet v1/v2")->Apply(ShuffleNetConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime();
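+ // The macro expands to one BENCHMARK_CAPTURE registration per model family
+ // defined below, so a direct-convolution benchmark wrapper needs only a single
+ // line, e.g. BENCHMARK_DCONV(f32_conv_hwc_3x3s2p1c3x4__neonfma_2x2); the
+ // kernel name in this example is illustrative rather than a guaranteed symbol.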
+
+
+ // ShuffleNet v1/v2.
+ static void ShuffleNetConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /********* Conv 1 ********/
+   /* H W GCout */
+   b->Args({224, 224, 24});
+ }
+
+ // MobileNet v1/v2.
+ static void MobileNetConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /* H W GCout */
+   b->Args({224, 224, 32});
+ }
+
+ // MobileNet v3 Small/Large.
+ static void MobileNetV3ConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /******************* Initial Stage *******************/
+   /* H W GCout */
+   b->Args({224, 224, 16});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "GCout"});
+
+   /*********************** Conv 1 **********************/
+   /* H W GCout */
+   b->Args({224, 224, 64});
+ }
bench/deconvolution.cc ADDED
@@ -0,0 +1,575 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <string>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+ #include "bench/utils.h"
29
+
30
+ void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
31
+ const size_t batch_size = state.range(0);
32
+ const size_t input_height = state.range(1);
33
+ const size_t input_width = state.range(2);
34
+ const size_t kernel_height = state.range(3);
35
+ const size_t kernel_width = state.range(4);
36
+ const size_t padding_height = state.range(5);
37
+ const size_t padding_width = state.range(6);
38
+ const size_t adjustment = state.range(7);
39
+ const size_t stride_height = state.range(8);
40
+ const size_t stride_width = state.range(9);
41
+ const size_t dilation = state.range(10);
42
+ const size_t input_channels = state.range(11);
43
+ const size_t output_channels = state.range(12);
44
+
45
+ std::random_device random_device;
46
+ auto rng = std::mt19937(random_device());
47
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
48
+ auto u8rng = std::bind(
49
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
50
+ std::ref(rng));
51
+
52
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
53
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
54
+ const size_t padding_left = padding_width / 2;
55
+ const size_t padding_top = padding_height / 2;
56
+ const size_t padding_right = padding_width - padding_left;
57
+ const size_t padding_bottom = padding_height - padding_top;
58
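+   // Transposed-convolution output size: every input pixel scatters a
+   // stride-spaced effective kernel into the output, so
+   // out = stride * (in - 1) + adjustment + effective_kernel - padding,
+   // with std::max keeping the subtraction from underflowing. For example,
+   // in = 7, stride = 2, kernel = 3, dilation = 1, padding = 2, adjustment = 1
+   // gives 2 * 6 + 1 + 3 - 2 = 14.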
+   const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
+   const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;
+
+   std::vector<uint8_t> input(batch_size * input_height * input_width * input_channels);
+   std::generate(input.begin(), input.end(), std::ref(u8rng));
+   std::vector<uint8_t> kernel(output_channels * kernel_height * kernel_width * input_channels);
+   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
+   std::vector<int32_t> bias(output_channels);
+   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
+   const size_t output_elements = batch_size * output_height * output_width * output_channels;
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(float) * (kernel.size() + bias.size() + output_elements));
+ std::vector<uint8_t> output(output_elements * num_buffers);
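+ // Rotating through num_buffers operator/output pairs makes each timed
+ // iteration touch memory that is no longer cache-resident: enough copies are
+ // allocated to exceed the largest cache, so the benchmark measures realistic
+ // memory traffic rather than a fully cached workload.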
79
+
80
+ std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
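+ // The scalar constants in the call below are quantization parameters:
+ // (zero point, scale) pairs for the input, kernel, and output, followed by
+ // the [0, 255] output clamping range. The specific values (zero point 127,
+ // scale 0.5) are arbitrary but valid choices for a throughput benchmark.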
81
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
82
+ status = xnn_create_deconvolution2d_nhwc_qu8(
83
+ padding_top, padding_right, padding_bottom, padding_left,
84
+ kernel_height, kernel_width,
85
+ stride_height, stride_width,
86
+ dilation, dilation,
87
+ /*groups=*/1, input_channels, output_channels,
88
+ /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
89
+ 127, 0.5f, 127, 0.5f,
90
+ kernel.data(), bias.data(),
91
+ 127, 0.5f, 0, 255,
92
+ 0 /* flags */,
93
+ nullptr, nullptr,
94
+ &deconvolution_op);
95
+ if (status != xnn_status_success) {
96
+ state.SkipWithError("failed to create QINT8 Deconvolution operator");
97
+ return;
98
+ }
99
+ }
100
+
101
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
102
+ status = xnn_reshape_deconvolution2d_nhwc_qu8(
103
+ deconvolution_operators[i],
104
+ batch_size, input_height, input_width,
105
+ 0 /* height adjustment */, 0 /* width adjustment */,
106
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
107
+ /*threadpool=*/nullptr);
108
+ if (status != xnn_status_success) {
109
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
110
+ return;
111
+ }
112
+ }
113
+
114
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
115
+ status = xnn_setup_deconvolution2d_nhwc_qu8(
116
+ deconvolution_operators[i],
117
+ input.data(), output.data() + i * output_elements);
118
+ if (status != xnn_status_success) {
119
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
120
+ return;
121
+ }
122
+ }
123
+
124
+ size_t buffer_index = 0;
125
+ for (auto _ : state) {
126
+ state.PauseTiming();
127
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
128
+ buffer_index = (buffer_index + 1) % num_buffers;
129
+ state.ResumeTiming();
130
+
131
+ status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
132
+ if (status != xnn_status_success) {
133
+ state.SkipWithError("failed to run QINT8 Deconvolution operator");
134
+ return;
135
+ }
136
+ }
137
+
138
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
139
+ status = xnn_delete_operator(deconvolution_op);
140
+ if (status != xnn_status_success) {
141
+ state.SkipWithError("failed to delete QINT8 Deconvolution operator");
142
+ return;
143
+ }
144
+ deconvolution_op = nullptr;
145
+ }
146
+
147
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
148
+ if (cpu_frequency != 0) {
149
+ state.counters["cpufreq"] = cpu_frequency;
150
+ }
151
+
152
+ state.counters["OPS"] = benchmark::Counter(
153
+ uint64_t(state.iterations()) * 2 *
154
+ batch_size * input_height * input_width *
155
+ input_channels * output_channels *
156
+ kernel_height * kernel_width,
157
+ benchmark::Counter::kIsRate);
158
+ }
159
+
160
+ void xnnpack_deconvolution_f32(benchmark::State& state, const char* net) {
161
+ const size_t batch_size = state.range(0);
162
+ const size_t input_height = state.range(1);
163
+ const size_t input_width = state.range(2);
164
+ const size_t kernel_height = state.range(3);
165
+ const size_t kernel_width = state.range(4);
166
+ const size_t padding_height = state.range(5);
167
+ const size_t padding_width = state.range(6);
168
+ const size_t adjustment = state.range(7);
169
+ const size_t stride_height = state.range(8);
170
+ const size_t stride_width = state.range(9);
171
+ const size_t dilation = state.range(10);
172
+ const size_t input_channels = state.range(11);
173
+ const size_t output_channels = state.range(12);
174
+
175
+ std::random_device random_device;
176
+ auto rng = std::mt19937(random_device());
177
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
178
+
179
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
180
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
181
+ const size_t padding_left = padding_width / 2;
182
+ const size_t padding_top = padding_height / 2;
183
+ const size_t padding_right = padding_width - padding_left;
184
+ const size_t padding_bottom = padding_height - padding_top;
185
+ const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
186
+ const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;
187
+
188
+ std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
189
+ batch_size * input_height * input_width * input_channels);
190
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
191
+ std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
192
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
193
+ std::vector<float> bias(output_channels);
194
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
195
+ const size_t output_elements = batch_size * output_height * output_width * output_channels;
196
+
197
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
198
+ if (status != xnn_status_success) {
199
+ state.SkipWithError("failed to initialize XNNPACK");
200
+ return;
201
+ }
202
+
203
+ const size_t num_buffers = 1 +
204
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
205
+ sizeof(float) * (kernel.size() + bias.size() + output_elements));
206
+ std::vector<float> output(output_elements * num_buffers);
207
+
208
+ std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
209
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
210
+ status = xnn_create_deconvolution2d_nhwc_f32(
211
+ padding_top, padding_right, padding_bottom, padding_left,
212
+ kernel_height, kernel_width,
213
+ stride_height, stride_width,
214
+ dilation, dilation,
215
+ /*groups=*/1, input_channels, output_channels,
216
+ /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
217
+ kernel.data(), bias.data(),
218
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
219
+ 0 /* flags */,
220
+ nullptr,
221
+ nullptr,
222
+ &deconvolution_op);
223
+ if (status != xnn_status_success) {
224
+ state.SkipWithError("failed to create FP32 Deconvolution operator");
225
+ return;
226
+ }
227
+ }
228
+
229
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
230
+ status = xnn_reshape_deconvolution2d_nhwc_f32(
231
+ deconvolution_operators[i],
232
+ batch_size, input_height, input_width,
233
+ 0 /* height adjustment */, 0 /* width adjustment */,
234
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
235
+ /*threadpool=*/nullptr);
236
+ if (status != xnn_status_success) {
237
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
238
+ return;
239
+ }
240
+ }
241
+
242
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
243
+ status = xnn_setup_deconvolution2d_nhwc_f32(
244
+ deconvolution_operators[i],
245
+ input.data(), output.data() + i * output_elements);
246
+ if (status != xnn_status_success) {
247
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
248
+ return;
249
+ }
250
+ }
251
+
252
+ size_t buffer_index = 0;
253
+ for (auto _ : state) {
254
+ state.PauseTiming();
255
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
256
+ buffer_index = (buffer_index + 1) % num_buffers;
257
+ state.ResumeTiming();
258
+
259
+ status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
260
+ if (status != xnn_status_success) {
261
+ state.SkipWithError("failed to run FP32 Deconvolution operator");
262
+ return;
263
+ }
264
+ }
265
+
266
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
267
+ status = xnn_delete_operator(deconvolution_op);
268
+ if (status != xnn_status_success) {
269
+ state.SkipWithError("failed to delete FP32 Deconvolution operator");
270
+ return;
271
+ }
272
+ deconvolution_op = nullptr;
273
+ }
274
+
275
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
276
+ if (cpu_frequency != 0) {
277
+ state.counters["cpufreq"] = cpu_frequency;
278
+ }
279
+
280
+ state.counters["FLOPS"] = benchmark::Counter(
281
+ uint64_t(state.iterations()) * 2 *
282
+ batch_size * input_height * input_width *
283
+ input_channels * output_channels *
284
+ kernel_height * kernel_width,
285
+ benchmark::Counter::kIsRate);
286
+ }
287
+
288
+ #ifdef BENCHMARK_TENSORFLOW_LITE
289
+ void tflite_deconvolution_f32(benchmark::State& state, const char* net) {
290
+ const size_t batch_size = state.range(0);
291
+ const size_t input_height = state.range(1);
292
+ const size_t input_width = state.range(2);
293
+ const size_t kernel_height = state.range(3);
294
+ const size_t kernel_width = state.range(4);
295
+ const size_t padding_height = state.range(5);
296
+ const size_t padding_width = state.range(6);
297
+ const size_t adjustment = state.range(7);
298
+ const size_t stride_height = state.range(8);
299
+ const size_t stride_width = state.range(9);
300
+ const size_t dilation = state.range(10);
301
+ const size_t input_channels = state.range(11);
302
+ const size_t output_channels = state.range(12);
303
+
304
+ if (dilation != 1) {
305
+ state.SkipWithError("dilated deconvolution is not supported");
306
+ return;
307
+ }
308
+
309
+ std::random_device random_device;
310
+ auto rng = std::mt19937(random_device());
311
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
312
+
313
+ tflite::Padding tf_padding = tflite::Padding_VALID;
314
+ if (padding_width == kernel_width - stride_width && padding_height == kernel_height - stride_height) {
315
+ tf_padding = tflite::Padding_SAME;
316
+ } else if (padding_width == 0 && padding_height == 0) {
317
+ tf_padding = tflite::Padding_VALID;
318
+ } else {
319
+ state.SkipWithError("unsupported padding");
320
+ return;
321
+ }
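+ // TFLite's TransposeConv expresses padding only as SAME or VALID, so the
+ // explicit padding amounts are mapped back: padding == kernel - stride
+ // reproduces SAME, zero padding reproduces VALID, and any other combination
+ // has no TFLite equivalent and is skipped.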
322
+
323
+ const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + kernel_height, padding_height) - padding_height;
324
+ const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + kernel_width, padding_width) - padding_width;
325
+
326
+ std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
327
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
328
+
329
+ flatbuffers::FlatBufferBuilder builder;
330
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
331
+ CreateOperatorCode(builder, tflite::BuiltinOperator_TRANSPOSE_CONV, 0);
332
+
333
+ flatbuffers::Offset<tflite::TransposeConvOptions> transpose_conv_options = CreateTransposeConvOptions(
334
+ builder,
335
+ tf_padding,
336
+ static_cast<int32_t>(stride_width), static_cast<int32_t>(stride_height));
337
+
338
+ const std::array<int32_t, 4> input_shape{{
339
+ static_cast<int32_t>(batch_size),
340
+ static_cast<int32_t>(input_height),
341
+ static_cast<int32_t>(input_width),
342
+ static_cast<int32_t>(input_channels)
343
+ }};
344
+ const std::array<int32_t, 4> output_shape{{
345
+ static_cast<int32_t>(batch_size),
346
+ static_cast<int32_t>(output_height),
347
+ static_cast<int32_t>(output_width),
348
+ static_cast<int32_t>(output_channels)
349
+ }};
350
+ const std::array<int32_t, 4> filter_shape{{
351
+ static_cast<int32_t>(output_channels),
352
+ static_cast<int32_t>(kernel_height),
353
+ static_cast<int32_t>(kernel_width),
354
+ static_cast<int32_t>(input_channels)
355
+ }};
356
+ const std::array<int32_t, 1> output_shape_shape{{ 4 }};
357
+
358
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 3> buffers{{
359
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
360
+ tflite::CreateBuffer(builder, builder.CreateVector(
361
+ reinterpret_cast<const uint8_t*>(kernel.data()),
362
+ sizeof(float) * kernel.size())),
363
+ tflite::CreateBuffer(builder, builder.CreateVector(
364
+ reinterpret_cast<const uint8_t*>(output_shape.data()),
365
+ sizeof(int32_t) * output_shape.size())),
366
+ }};
367
+
368
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 4> tensors{{
369
+ tflite::CreateTensor(builder,
370
+ builder.CreateVector<int32_t>(output_shape_shape.data(), output_shape_shape.size()),
371
+ tflite::TensorType_INT32,
372
+ 2 /* buffer id */),
373
+ tflite::CreateTensor(builder,
374
+ builder.CreateVector<int32_t>(filter_shape.data(), filter_shape.size()),
375
+ tflite::TensorType_FLOAT32,
376
+ 1 /* buffer id */),
377
+ tflite::CreateTensor(builder,
378
+ builder.CreateVector<int32_t>(input_shape.data(), input_shape.size()),
379
+ tflite::TensorType_FLOAT32),
380
+ tflite::CreateTensor(builder,
381
+ builder.CreateVector<int32_t>(output_shape.data(), output_shape.size()),
382
+ tflite::TensorType_FLOAT32),
383
+ }};
384
+
385
+ const std::array<int32_t, 3> op_inputs{{ 0, 1, 2 }};
386
+ const std::array<int32_t, 1> op_outputs{{ 3 }};
387
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
388
+ builder,
389
+ 0 /* opcode_index */,
390
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
391
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
392
+ tflite::BuiltinOptions_TransposeConvOptions,
393
+ transpose_conv_options.Union());
394
+
395
+ const std::array<int32_t, 1> graph_inputs{{ 2 }};
396
+ const std::array<int32_t, 1> graph_outputs{{ 3 }};
397
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
398
+ builder,
399
+ builder.CreateVector(tensors.data(), tensors.size()),
400
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
401
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
402
+ builder.CreateVector(&op, 1),
403
+ builder.CreateString("TransposeConv subgraph"));
404
+
405
+ const flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("TransposeConv model");
406
+
407
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
408
+ TFLITE_SCHEMA_VERSION,
409
+ builder.CreateVector(&operator_code, 1),
410
+ builder.CreateVector(&subgraph, 1),
411
+ description,
412
+ builder.CreateVector(buffers.data(), buffers.size()));
413
+
414
+ builder.Finish(model_buffer);
415
+
416
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
417
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
418
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
419
+ std::unique_ptr<tflite::Interpreter> interpreter;
420
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
421
+ state.SkipWithError("failed to create TFLite interpreter");
422
+ return;
423
+ }
424
+ if (interpreter == nullptr) {
425
+ state.SkipWithError("TFLite interpreter is null");
426
+ return;
427
+ }
428
+ interpreter->SetNumThreads(1);
429
+
430
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
431
+ state.SkipWithError("failed to allocate tensors");
432
+ return;
433
+ }
434
+
435
+ std::generate(
436
+ interpreter->typed_tensor<float>(2),
437
+ interpreter->typed_tensor<float>(2) + batch_size * input_channels * input_height * input_width,
438
+ std::ref(f32rng));
439
+
440
+ for (auto _ : state) {
441
+ state.PauseTiming();
442
+ benchmark::utils::WipeCache();
443
+ benchmark::utils::PrefetchToL1(
444
+ interpreter->typed_tensor<float>(2),
445
+ batch_size * input_channels * input_height * input_width * sizeof(float));
446
+ state.ResumeTiming();
447
+
448
+ if (interpreter->Invoke() != kTfLiteOk) {
449
+ state.SkipWithError("failed to invoke TFLite interpreter");
450
+ return;
451
+ }
452
+ }
453
+
454
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
455
+ if (cpu_frequency != 0) {
456
+ state.counters["cpufreq"] = cpu_frequency;
457
+ }
458
+
459
+ state.counters["FLOPS"] = benchmark::Counter(
460
+ uint64_t(state.iterations()) * 2 *
461
+ batch_size * input_height * input_width *
462
+ input_channels * output_channels *
463
+ kernel_height * kernel_width,
464
+ benchmark::Counter::kIsRate);
465
+
466
+ interpreter.reset();
467
+ }
468
+ #endif // BENCHMARK_TENSORFLOW_LITE
469
+
470
+ // FCN-32 model (PASCAL VOC version).
471
+ // We assume CIF image (352x288) on model input / output.
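+ // At the FCN-32 head the feature map has been downsampled 32x, so the
+ // 352x288 CIF frame becomes roughly 11x9 (352 / 32 = 11, 288 / 32 = 9),
+ // matching the H and W arguments below.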
472
+ static void FCN32(benchmark::internal::Benchmark* b) {
473
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
474
+
475
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
476
+ b->Args({1, 9, 11, 64, 64, 0, 0, 0, 32, 32, 1, 21, 21});
477
+ }
478
+
479
+ // FCN-16 model (PASCAL VOC version).
480
+ // We assume CIF image (352x288) on model input / output.
481
+ static void FCN16(benchmark::internal::Benchmark* b) {
482
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
483
+
484
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
485
+ b->Args({1, 9, 11, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
486
+ b->Args({1, 18, 22, 32, 32, 0, 0, 0, 16, 16, 1, 21, 21});
487
+ }
488
+
489
+ // FCN-8 model (PASCAL VOC version).
490
+ // We assume CIF image (352x288) on model input / output.
491
+ static void FCN8(benchmark::internal::Benchmark* b) {
492
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
493
+
494
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
495
+ b->Args({1, 9, 11, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
496
+ b->Args({1, 18, 22, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
497
+ b->Args({1, 36, 44, 16, 16, 0, 0, 0, 8, 8, 1, 21, 21});
498
+ }
499
+
500
+ static void ENet(benchmark::internal::Benchmark* b) {
501
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
502
+
503
+ /*********************** Bottleneck 4.0 ***********************/
504
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
505
+ b->Args({1, 64, 64, 3, 3, 2, 2, 1, 2, 2, 1, 32, 32});
506
+ /*********************** Bottleneck 5.0 ***********************/
507
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
508
+ b->Args({1, 128, 128, 3, 3, 2, 2, 1, 2, 2, 1, 16, 16});
509
+ /******************* Final Full Convolution *******************/
510
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
511
+ b->Args({1, 256, 256, 2, 2, 0, 0, 0, 2, 2, 1, 16, 12});
512
+ }
513
+
514
+ static void ESPNet(benchmark::internal::Benchmark* b) {
515
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
516
+
517
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
518
+ b->Args({1, 64, 128, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
519
+ b->Args({1, 128, 256, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
520
+ b->Args({1, 256, 512, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
521
+ }
522
+
523
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn32, "FCN-32")
524
+ ->Apply(FCN32)
525
+ ->UseRealTime();
526
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn16, "FCN-16")
527
+ ->Apply(FCN16)
528
+ ->UseRealTime();
529
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn8, "FCN-8")
530
+ ->Apply(FCN8)
531
+ ->UseRealTime();
532
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, enet, "ENet")
533
+ ->Apply(ENet)
534
+ ->UseRealTime();
535
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, espnet, "ESPNet")
536
+ ->Apply(ESPNet)
537
+ ->UseRealTime();
538
+
539
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn32, "FCN-32")
540
+ ->Apply(FCN32)
541
+ ->UseRealTime();
542
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn16, "FCN-16")
543
+ ->Apply(FCN16)
544
+ ->UseRealTime();
545
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn8, "FCN-8")
546
+ ->Apply(FCN8)
547
+ ->UseRealTime();
548
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, enet, "ENet")
549
+ ->Apply(ENet)
550
+ ->UseRealTime();
551
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, espnet, "ESPNet")
552
+ ->Apply(ESPNet)
553
+ ->UseRealTime();
554
+
555
+ #ifdef BENCHMARK_TENSORFLOW_LITE
556
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn32, "FCN-32")
557
+ ->Apply(FCN32)
558
+ ->UseRealTime();
559
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn16, "FCN-16")
560
+ ->Apply(FCN16)
561
+ ->UseRealTime();
562
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn8, "FCN-8")
563
+ ->Apply(FCN8)
564
+ ->UseRealTime();
565
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, enet, "ENet")
566
+ ->Apply(ENet)
567
+ ->UseRealTime();
568
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, espnet, "ESPNet")
569
+ ->Apply(ESPNet)
570
+ ->UseRealTime();
571
+ #endif // BENCHMARK_TENSORFLOW_LITE
572
+
573
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
574
+ BENCHMARK_MAIN();
575
+ #endif
bench/dwconv.h ADDED
@@ -0,0 +1,368 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #pragma once
10
+
11
+ #include <benchmark/benchmark.h>
12
+
13
+
14
+ #define BENCHMARK_DWCONV(dwconv_fn) \
15
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1DWConvArguments)->UseRealTime(); \
16
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2DWConvArguments)->UseRealTime(); \
17
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallDWConvArguments)->UseRealTime(); \
18
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeDWConvArguments)->UseRealTime(); \
19
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1DWConvArguments)->UseRealTime(); \
20
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2DWConvArguments)->UseRealTime(); \
21
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3DWConvArguments)->UseRealTime(); \
22
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4DWConvArguments)->UseRealTime(); \
23
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8DWConvArguments)->UseRealTime(); \
24
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05DWConvArguments)->UseRealTime(); \
25
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10DWConvArguments)->UseRealTime(); \
26
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15DWConvArguments)->UseRealTime(); \
27
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20DWConvArguments)->UseRealTime();
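+
+ // Usage sketch (hypothetical function name; any benchmark with the
+ // BENCHMARK_CAPTURE-compatible signature works):
+ //   void f32_dwconv(benchmark::State& state, const char* net) { ... }
+ //   BENCHMARK_DWCONV(f32_dwconv)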
28
+
29
+
30
+ // ShuffleNet v1 with 1 group.
31
+ static void ShuffleNetV1G1DWConvArguments(benchmark::internal::Benchmark* b) {
32
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
33
+
34
+ /********* Stage 2: stride-2 unit *********/
35
+ /* H W KH KW PH PW S D G */
36
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 36});
37
+ /********* Stage 2: stride-1 units ********/
38
+ /* H W KH KW PH PW S D G */
39
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 36});
40
+ /********* Stage 3: stride-2 unit *********/
41
+ /* H W KH KW PH PW S D G */
42
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 72});
43
+ /********* Stage 3: stride-1 units ********/
44
+ /* H W KH KW PH PW S D G */
45
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 72});
46
+ /********* Stage 4: stride-2 unit *********/
47
+ /* H W KH KW PH PW S D G */
48
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 144});
49
+ /********* Stage 4: stride-1 units ********/
50
+ /* H W KH KW PH PW S D G */
51
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 144});
52
+ }
53
+
54
+ // ShuffleNet v1 with 2 groups.
55
+ static void ShuffleNetV1G2DWConvArguments(benchmark::internal::Benchmark* b) {
56
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
57
+
58
+ /********* Stage 2: stride-2 unit *********/
59
+ /* H W KH KW PH PW S D G */
60
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 50});
61
+ /********* Stage 2: stride-1 units ********/
62
+ /* H W KH KW PH PW S D G */
63
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 50});
64
+ /********* Stage 3: stride-2 unit *********/
65
+ /* H W KH KW PH PW S D G */
66
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 100});
67
+ /********* Stage 3: stride-1 units ********/
68
+ /* H W KH KW PH PW S D G */
69
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 100});
70
+ /********* Stage 4: stride-2 unit *********/
71
+ /* H W KH KW PH PW S D G */
72
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 200});
73
+ /********* Stage 4: stride-1 units ********/
74
+ /* H W KH KW PH PW S D G */
75
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 200});
76
+ }
77
+
78
+ // ShuffleNet v1 with 3 groups.
79
+ static void ShuffleNetV1G3DWConvArguments(benchmark::internal::Benchmark* b) {
80
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
81
+
82
+ /********* Stage 2: stride-2 unit **********/
83
+ /* H W KH KW PH PW S D G */
84
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 60});
85
+ /********* Stage 2: stride-1 units *********/
86
+ /* H W KH KW PH PW S D G */
87
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 60});
88
+ /********* Stage 3: stride-2 unit **********/
89
+ /* H W KH KW PH PW S D G */
90
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 120});
91
+ /********* Stage 3: stride-1 units *********/
92
+ /* H W KH KW PH PW S D G */
93
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 120});
94
+ /********* Stage 4: stride-2 unit **********/
95
+ /* H W KH KW PH PW S D G */
96
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 240});
97
+ /********* Stage 4: stride-1 units *********/
98
+ /* H W KH KW PH PW S D G */
99
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 240});
100
+ }
101
+
102
+ // ShuffleNet v1 with 4 groups.
103
+ static void ShuffleNetV1G4DWConvArguments(benchmark::internal::Benchmark* b) {
104
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
105
+
106
+ /********* Stage 2: stride-2 unit *********/
107
+ /* H W KH KW PH PW S D G */
108
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 68});
109
+ /********* Stage 2: stride-1 units ********/
110
+ /* H W KH KW PH PW S D G */
111
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 68});
112
+ /********* Stage 3: stride-2 unit *********/
113
+ /* H W KH KW PH PW S D G */
114
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 136});
115
+ /********* Stage 3: stride-1 units ********/
116
+ /* H W KH KW PH PW S D G */
117
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 136});
118
+ /********* Stage 4: stride-2 unit *********/
119
+ /* H W KH KW PH PW S D G */
120
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 272});
121
+ /********* Stage 4: stride-1 units ********/
122
+ /* H W KH KW PH PW S D G */
123
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 272});
124
+ }
125
+
126
+ // ShuffleNet v1 with 8 groups.
127
+ static void ShuffleNetV1G8DWConvArguments(benchmark::internal::Benchmark* b) {
128
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
129
+
130
+ /********* Stage 2: stride-2 unit *********/
131
+ /* H W KH KW PH PW S D G */
132
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 96});
133
+ /********* Stage 2: stride-1 units ********/
134
+ /* H W KH KW PH PW S D G */
135
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 96});
136
+ /********* Stage 3: stride-2 unit *********/
137
+ /* H W KH KW PH PW S D G */
138
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 192});
139
+ /********* Stage 3: stride-1 units ********/
140
+ /* H W KH KW PH PW S D G */
141
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 192});
142
+ /********* Stage 4: stride-2 unit *********/
143
+ /* H W KH KW PH PW S D G */
144
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 384});
145
+ /********* Stage 4: stride-1 units ********/
146
+ /* H W KH KW PH PW S D G */
147
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 384});
148
+ }
149
+
150
+ // ShuffleNet v2 (0.5X scale)
151
+ static void ShuffleNetV2X05DWConvArguments(benchmark::internal::Benchmark* b) {
152
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
153
+
154
+ /**************** Stage 2 *****************/
155
+ /* H W KH KW PH PW S D G */
156
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
157
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 24});
158
+ /**************** Stage 3 *****************/
159
+ /* H W KH KW PH PW S D G */
160
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 48});
161
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 48});
162
+ /**************** Stage 4 *****************/
163
+ /* H W KH KW PH PW S D G */
164
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 96});
165
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 96});
166
+ }
167
+
168
+ // ShuffleNet v2 (1.0X scale)
169
+ static void ShuffleNetV2X10DWConvArguments(benchmark::internal::Benchmark* b) {
170
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
171
+
172
+ /**************** Stage 2 *****************/
173
+ /* H W KH KW PH PW S D G */
174
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
175
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 58});
176
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 58});
177
+ /**************** Stage 3 *****************/
178
+ /* H W KH KW PH PW S D G */
179
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 116});
180
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 116});
181
+ /**************** Stage 4 *****************/
182
+ /* H W KH KW PH PW S D G */
183
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 232});
184
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 232});
185
+ }
186
+
187
+ // ShuffleNet v2 (1.5X scale)
188
+ static void ShuffleNetV2X15DWConvArguments(benchmark::internal::Benchmark* b) {
189
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
190
+
191
+ /**************** Stage 2 *****************/
192
+ /* H W KH KW PH PW S D G */
193
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
194
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 88});
195
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 88});
196
+ /**************** Stage 3 *****************/
197
+ /* H W KH KW PH PW S D G */
198
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 176});
199
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 176});
200
+ /**************** Stage 4 *****************/
201
+ /* H W KH KW PH PW S D G */
202
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 352});
203
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 352});
204
+ }
205
+
206
+ // ShuffleNet v2 (2.0X scale)
207
+ static void ShuffleNetV2X20DWConvArguments(benchmark::internal::Benchmark* b) {
208
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
209
+
210
+ /***************** Stage 2 ****************/
211
+ /* H W KH KW PH PW S D G */
212
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
213
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 122});
214
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 122});
215
+ /***************** Stage 3 ****************/
216
+ /* H W KH KW PH PW S D G */
217
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 244});
218
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 244});
219
+ /***************** Stage 4 ****************/
220
+ /* H W KH KW PH PW S D G */
221
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 488});
222
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 488});
223
+ }
224
+
225
+ static void MobileNetV1DWConvArguments(benchmark::internal::Benchmark* b) {
226
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
227
+
228
+ /* H W KH KW PH PW S D G */
229
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 32});
230
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 64});
231
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 128});
232
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 128});
233
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 256});
234
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 256});
235
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 512});
236
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 512});
237
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 1024});
238
+ }
239
+
240
+ static void MobileNetV2DWConvArguments(benchmark::internal::Benchmark* b) {
241
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
242
+
243
+ /**************** Bottleneck 1 ***************/
244
+ /* H W KH KW PH PW S D G */
245
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 32});
246
+
247
+ /**************** Bottleneck 2 ***************/
248
+ /* H W KH KW PH PW S D G */
249
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 96});
250
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 144});
251
+
252
+ /**************** Bottleneck 3 ***************/
253
+ /* H W KH KW PH PW S D G */
254
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 144});
255
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 192});
256
+ //b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 192});
257
+
258
+ /**************** Bottleneck 4 ***************/
259
+ /* H W KH KW PH PW S D G */
260
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 192});
261
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
262
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
263
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
264
+
265
+ /**************** Bottleneck 5 ***************/
266
+ /* H W KH KW PH PW S D G */
267
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
268
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 576});
269
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 576});
270
+
271
+ /**************** Bottleneck 6 ***************/
272
+ /* H W KH KW PH PW S D G */
273
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 576});
274
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
275
+ //b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
276
+
277
+ /**************** Bottleneck 7 ***************/
278
+ /* H W KH KW PH PW S D G */
279
+ //b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
280
+ }
281
+
282
+ static void MobileNetV3SmallDWConvArguments(benchmark::internal::Benchmark* b) {
283
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
284
+
285
+ /*************** Bottleneck 1 ***************/
286
+ /* H W KH KW PH PW S D G */
287
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 16});
288
+ /*************** Bottleneck 2 ***************/
289
+ /* H W KH KW PH PW S D G */
290
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 72});
291
+ /*************** Bottleneck 3 ***************/
292
+ /* H W KH KW PH PW S D G */
293
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 88});
294
+ /*************** Bottleneck 4 ***************/
295
+ /* H W KH KW PH PW S D G */
296
+ b->Args({ 28, 28, 5, 5, 4, 4, 2, 1, 96});
297
+ /*************** Bottleneck 5 ***************/
298
+ /* H W KH KW PH PW S D G */
299
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 240});
300
+ /*************** Bottleneck 6 ***************/
301
+ /* H W KH KW PH PW S D G */
302
+ //b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 240});
303
+ /*************** Bottleneck 7 ***************/
304
+ /* H W KH KW PH PW S D G */
305
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 120});
306
+ /*************** Bottleneck 8 ***************/
307
+ /* H W KH KW PH PW S D G */
308
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 144});
309
+ /*************** Bottleneck 9 ***************/
310
+ /* H W KH KW PH PW S D G */
311
+ b->Args({ 14, 14, 5, 5, 4, 4, 2, 1, 288});
312
+ /*************** Bottleneck 10 **************/
313
+ /* H W KH KW PH PW S D G */
314
+ b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 576});
315
+ /*************** Bottleneck 11 **************/
316
+ /* H W KH KW PH PW S D G */
317
+ //b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 576});
318
+ }
319
+
320
+ static void MobileNetV3LargeDWConvArguments(benchmark::internal::Benchmark* b) {
321
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
322
+
323
+ /*************** Bottleneck 1 ***************/
324
+ /* H W KH KW PH PW S D G */
325
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 16});
326
+ /*************** Bottleneck 2 ***************/
327
+ /* H W KH KW PH PW S D G */
328
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 64});
329
+ /*************** Bottleneck 3 ***************/
330
+ /* H W KH KW PH PW S D G */
331
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 72});
332
+ /*************** Bottleneck 4 ***************/
333
+ /* H W KH KW PH PW S D G */
334
+ b->Args({ 56, 56, 5, 5, 4, 4, 2, 1, 72});
335
+ /*************** Bottleneck 5 ***************/
336
+ /* H W KH KW PH PW S D G */
337
+ b->Args({ 28, 28, 5, 5, 4, 4, 1, 1, 120});
338
+ /*************** Bottleneck 6 ***************/
339
+ /* H W KH KW PH PW S D G */
340
+ //b->Args({ 28, 28, 5, 5, 4, 4, 1, 1, 120});
341
+ /*************** Bottleneck 7 ***************/
342
+ /* H W KH KW PH PW S D G */
343
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 240});
344
+ /*************** Bottleneck 8 ***************/
345
+ /* H W KH KW PH PW S D G */
346
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 200});
347
+ /*************** Bottleneck 9 ***************/
348
+ /* H W KH KW PH PW S D G */
349
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 184});
350
+ /*************** Bottleneck 10 **************/
351
+ /* H W KH KW PH PW S D G */
352
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 184});
353
+ /*************** Bottleneck 11 **************/
354
+ /* H W KH KW PH PW S D G */
355
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 480});
356
+ /*************** Bottleneck 12 **************/
357
+ /* H W KH KW PH PW S D G */
358
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 672});
359
+ /*************** Bottleneck 13 **************/
360
+ /* H W KH KW PH PW S D G */
361
+ b->Args({ 14, 14, 5, 5, 4, 4, 2, 1, 672});
362
+ /*************** Bottleneck 14 **************/
363
+ /* H W KH KW PH PW S D G */
364
+ b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 960});
365
+ /*************** Bottleneck 15 **************/
366
+ /* H W KH KW PH PW S D G */
367
+ //b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 960});
368
+ }
bench/elu.cc ADDED
@@ -0,0 +1,460 @@
1
+ // Copyright 2020 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <array>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <limits>
11
+ #include <memory>
12
+ #include <random>
13
+ #include <vector>
14
+
15
+ #include <xnnpack.h>
16
+
17
+ #include <fp16/fp16.h>
18
+ #include "bench/utils.h"
19
+ #include <benchmark/benchmark.h>
20
+ #ifdef BENCHMARK_TENSORFLOW_LITE
21
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
22
+ #include "tensorflow/lite/interpreter.h"
23
+ #include "tensorflow/lite/kernels/register.h"
24
+ #include "tensorflow/lite/model.h"
25
+ #include "tensorflow/lite/schema/schema_generated.h"
26
+ #include "tensorflow/lite/version.h"
27
+ #endif // BENCHMARK_TENSORFLOW_LITE
28
+
29
+
30
+ static void xnnpack_elu_f16(benchmark::State& state) {
31
+ const size_t batch_size = state.range(0);
32
+
33
+ std::random_device random_device;
34
+ auto rng = std::mt19937(random_device());
35
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
36
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
37
+
38
+ std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
39
+ std::vector<uint16_t> output(batch_size);
40
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
41
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
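+ // Pre-filling the output with a NaN pattern makes it easy to spot, when
+ // debugging, any elements the operator failed to overwrite; 0x7E00 is a
+ // canonical half-precision NaN.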
42
+
43
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
44
+ if (status != xnn_status_success) {
45
+ state.SkipWithError("failed to initialize XNNPACK");
46
+ return;
47
+ }
48
+
49
+ xnn_operator_t elu_op = nullptr;
50
+ status = xnn_create_elu_nc_f16(
51
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
52
+ 1.0f /* alpha */, 0 /* flags */, &elu_op);
53
+ if (status != xnn_status_success || elu_op == nullptr) {
54
+ state.SkipWithError("failed to create ELU operator");
55
+ return;
56
+ }
57
+
58
+ status = xnn_reshape_elu_nc_f16(elu_op, batch_size, /*threadpool=*/nullptr);
59
+ if (status != xnn_status_success) {
60
+ state.SkipWithError("failed to reshape ELU operator");
61
+ return;
62
+ }
63
+
64
+ status = xnn_setup_elu_nc_f16(elu_op, input.data(), output.data());
65
+ if (status != xnn_status_success) {
66
+ state.SkipWithError("failed to setup ELU operator");
67
+ return;
68
+ }
69
+
70
+ for (auto _ : state) {
71
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
72
+ if (status != xnn_status_success) {
73
+ state.SkipWithError("failed to run ELU operator");
74
+ return;
75
+ }
76
+ }
77
+
78
+ status = xnn_delete_operator(elu_op);
79
+ if (status != xnn_status_success) {
80
+ state.SkipWithError("failed to delete ELU operator");
81
+ return;
82
+ }
83
+
84
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
85
+ if (cpu_frequency != 0) {
86
+ state.counters["cpufreq"] = cpu_frequency;
87
+ }
88
+
89
+ state.counters["elements"] =
90
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
91
+
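+ // The factor of 2 in bytes_per_iteration counts each element once as a read
+ // (input) and once as a write (output), so "bytes" reports total memory
+ // traffic per second.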
92
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
93
+ state.counters["bytes"] =
94
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
95
+ }
96
+
97
+ static void xnnpack_elu_f32(benchmark::State& state) {
98
+ const size_t batch_size = state.range(0);
99
+
100
+ std::random_device random_device;
101
+ auto rng = std::mt19937(random_device());
102
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
103
+
104
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
105
+ std::vector<float> output(batch_size);
106
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
107
+ std::fill(output.begin(), output.end(), std::nanf(""));
108
+
109
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
110
+ if (status != xnn_status_success) {
111
+ state.SkipWithError("failed to initialize XNNPACK");
112
+ return;
113
+ }
114
+
115
+ xnn_operator_t elu_op = nullptr;
116
+ status = xnn_create_elu_nc_f32(
117
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
118
+ 1.0f /* alpha */, 0 /* flags */, &elu_op);
119
+ if (status != xnn_status_success || elu_op == nullptr) {
120
+ state.SkipWithError("failed to create ELU operator");
121
+ return;
122
+ }
123
+
124
+ status = xnn_reshape_elu_nc_f32(elu_op, batch_size, /*threadpool=*/nullptr);
125
+ if (status != xnn_status_success) {
126
+ state.SkipWithError("failed to reshape ELU operator");
127
+ return;
128
+ }
129
+
130
+ status = xnn_setup_elu_nc_f32(elu_op, input.data(), output.data());
131
+ if (status != xnn_status_success) {
132
+ state.SkipWithError("failed to setup ELU operator");
133
+ return;
134
+ }
135
+
136
+ for (auto _ : state) {
137
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
138
+ if (status != xnn_status_success) {
139
+ state.SkipWithError("failed to run ELU operator");
140
+ return;
141
+ }
142
+ }
143
+
144
+ status = xnn_delete_operator(elu_op);
145
+ if (status != xnn_status_success) {
146
+ state.SkipWithError("failed to delete ELU operator");
147
+ return;
148
+ }
149
+
150
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
151
+ if (cpu_frequency != 0) {
152
+ state.counters["cpufreq"] = cpu_frequency;
153
+ }
154
+
155
+ state.counters["elements"] =
156
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
157
+
158
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
159
+ state.counters["bytes"] =
160
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
161
+ }
162
+
163
+ static void xnnpack_elu_qs8(benchmark::State& state) {
164
+ const size_t batch_size = state.range(0);
165
+
166
+ std::random_device random_device;
167
+ auto rng = std::mt19937(random_device());
168
+ auto i8rng = std::bind(
169
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
170
+ std::ref(rng));
171
+
172
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
173
+ std::vector<int8_t> output(batch_size);
174
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
175
+ std::fill(output.begin(), output.end(), INT8_C(0xA5));
176
+
177
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
178
+ if (status != xnn_status_success) {
179
+ state.SkipWithError("failed to initialize XNNPACK");
180
+ return;
181
+ }
182
+
183
+ xnn_operator_t elu_op = nullptr;
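+ // The quantization in the call below is the identity mapping (zero point 0,
+ // scale 1.0) on both input and output; any valid parameters would do for
+ // measuring throughput.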
184
+ status = xnn_create_elu_nc_qs8(
185
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
186
+ 1.0f /* alpha */,
187
+ 0 /* input zero point */, 1.0f /* input scale */,
188
+ 0 /* output zero point */, 1.0f /* output scale */,
189
+ std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
190
+ 0 /* flags */, &elu_op);
191
+ if (status != xnn_status_success || elu_op == nullptr) {
192
+ state.SkipWithError("failed to create ELU operator");
193
+ return;
194
+ }
195
+
196
+ status = xnn_reshape_elu_nc_qs8(elu_op, batch_size, /*threadpool=*/nullptr);
197
+ if (status != xnn_status_success) {
198
+ state.SkipWithError("failed to reshape ELU operator");
199
+ return;
200
+ }
201
+
202
+ status = xnn_setup_elu_nc_qs8(elu_op, input.data(), output.data());
203
+ if (status != xnn_status_success) {
204
+ state.SkipWithError("failed to setup ELU operator");
205
+ return;
206
+ }
207
+
208
+ for (auto _ : state) {
209
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
210
+ if (status != xnn_status_success) {
211
+ state.SkipWithError("failed to run ELU operator");
212
+ return;
213
+ }
214
+ }
215
+
216
+ status = xnn_delete_operator(elu_op);
217
+ if (status != xnn_status_success) {
218
+ state.SkipWithError("failed to delete ELU operator");
219
+ return;
220
+ }
221
+
222
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
223
+ if (cpu_frequency != 0) {
224
+ state.counters["cpufreq"] = cpu_frequency;
225
+ }
226
+
227
+ state.counters["elements"] =
228
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
229
+
230
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
231
+ state.counters["bytes"] =
232
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
233
+ }
234
+
235
+ #ifdef BENCHMARK_TENSORFLOW_LITE
236
+ static void tflite_elu_f32(benchmark::State& state) {
237
+ const size_t batch_size = state.range(0);
238
+
239
+ std::random_device random_device;
240
+ auto rng = std::mt19937(random_device());
241
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
242
+
243
+ flatbuffers::FlatBufferBuilder builder;
244
+ const flatbuffers::Offset<tflite::OperatorCode> operator_code =
245
+ CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);
246
+
247
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
248
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
249
+ }};
250
+
251
+ const std::array<int32_t, 1> shape{{
252
+ static_cast<int32_t>(batch_size)
253
+ }};
254
+
255
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
256
+ tflite::CreateTensor(builder,
257
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
258
+ tflite::TensorType_FLOAT32),
259
+ tflite::CreateTensor(builder,
260
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
261
+ tflite::TensorType_FLOAT32),
262
+ }};
263
+
264
+ const std::array<int32_t, 1> op_inputs{{ 0 }};
265
+ const std::array<int32_t, 1> op_outputs{{ 1 }};
266
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
267
+ builder,
268
+ 0 /* opcode_index */,
269
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
270
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
271
+
272
+ const std::array<int32_t, 1> graph_inputs{{ 0 }};
273
+ const std::array<int32_t, 1> graph_outputs{{ 1 }};
274
+ const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
275
+ builder,
276
+ builder.CreateVector(tensors.data(), tensors.size()),
277
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
278
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
279
+ builder.CreateVector(&op, 1));
280
+
281
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
282
+ TFLITE_SCHEMA_VERSION,
283
+ builder.CreateVector(&operator_code, 1),
284
+ builder.CreateVector(&subgraph, 1),
285
+ builder.CreateString("ELU model"),
286
+ builder.CreateVector(buffers.data(), buffers.size()));
287
+
288
+ builder.Finish(model_buffer);
289
+
290
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
291
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
292
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
293
+ std::unique_ptr<tflite::Interpreter> interpreter;
294
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
295
+ state.SkipWithError("failed to create TFLite interpreter");
296
+ return;
297
+ }
298
+ interpreter->SetNumThreads(1);
299
+
300
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
301
+ state.SkipWithError("failed to allocate tensors");
302
+ return;
303
+ }
304
+
305
+ std::generate(
306
+ interpreter->typed_tensor<float>(0),
307
+ interpreter->typed_tensor<float>(0) + batch_size,
308
+ std::ref(f32rng));
309
+
310
+ for (auto _ : state) {
311
+ if (interpreter->Invoke() != kTfLiteOk) {
312
+ state.SkipWithError("failed to invoke TFLite interpreter");
313
+ return;
314
+ }
315
+ }
316
+
317
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
318
+ if (cpu_frequency != 0) {
319
+ state.counters["cpufreq"] = cpu_frequency;
320
+ }
321
+
322
+ state.counters["elements"] =
323
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
324
+
325
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
326
+ state.counters["bytes"] =
327
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
328
+
329
+ interpreter.reset();
330
+ }
331
+
332
+ static void tflite_elu_qs8(benchmark::State& state) {
333
+ const size_t batch_size = state.range(0);
334
+
335
+ std::random_device random_device;
336
+ auto rng = std::mt19937(random_device());
337
+ auto i8rng = std::bind(
338
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
339
+ std::ref(rng));
340
+
341
+ flatbuffers::FlatBufferBuilder builder;
342
+ const flatbuffers::Offset<tflite::OperatorCode> operator_code =
343
+ CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);
344
+
345
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
346
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
347
+ }};
348
+
349
+ const std::array<int32_t, 1> shape{{
350
+ static_cast<int32_t>(batch_size)
351
+ }};
352
+
353
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
354
+ tflite::CreateTensor(builder,
355
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
356
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
357
+ tflite::CreateQuantizationParameters(builder,
358
+ 0 /*min*/, 0 /*max*/,
359
+ builder.CreateVector<float>({1.0f /* scale */}),
360
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
361
+ tflite::CreateTensor(builder,
362
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
363
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
364
+ tflite::CreateQuantizationParameters(builder,
365
+ 0 /*min*/, 0 /*max*/,
366
+ builder.CreateVector<float>({1.0f /* scale */}),
367
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
368
+ }};
369
+
370
+ const std::array<int32_t, 1> op_inputs{{ 0 }};
371
+ const std::array<int32_t, 1> op_outputs{{ 1 }};
372
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
373
+ builder,
374
+ 0 /* opcode_index */,
375
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
376
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
377
+
378
+ const std::array<int32_t, 1> graph_inputs{{ 0 }};
379
+ const std::array<int32_t, 1> graph_outputs{{ 1 }};
380
+ const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
381
+ builder,
382
+ builder.CreateVector(tensors.data(), tensors.size()),
383
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
384
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
385
+ builder.CreateVector(&op, 1));
386
+
387
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
388
+ TFLITE_SCHEMA_VERSION,
389
+ builder.CreateVector(&operator_code, 1),
390
+ builder.CreateVector(&subgraph, 1),
391
+ builder.CreateString("ELU model"),
392
+ builder.CreateVector(buffers.data(), buffers.size()));
393
+
394
+ builder.Finish(model_buffer);
395
+
396
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
397
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
398
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
399
+ std::unique_ptr<tflite::Interpreter> interpreter;
400
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
401
+ state.SkipWithError("failed to create TFLite interpreter");
402
+ return;
403
+ }
404
+ interpreter->SetNumThreads(1);
405
+
406
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
407
+ state.SkipWithError("failed to allocate tensors");
408
+ return;
409
+ }
410
+
411
+ std::generate(
412
+ interpreter->typed_tensor<int8_t>(0),
413
+ interpreter->typed_tensor<int8_t>(0) + batch_size,
414
+ std::ref(i8rng));
415
+
416
+ for (auto _ : state) {
417
+ if (interpreter->Invoke() != kTfLiteOk) {
418
+ state.SkipWithError("failed to invoke TFLite interpreter");
419
+ return;
420
+ }
421
+ }
422
+
423
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
424
+ if (cpu_frequency != 0) {
425
+ state.counters["cpufreq"] = cpu_frequency;
426
+ }
427
+
428
+ state.counters["elements"] =
429
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
430
+
431
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
432
+ state.counters["bytes"] =
433
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
434
+
435
+ interpreter.reset();
436
+ }
437
+ #endif // BENCHMARK_TENSORFLOW_LITE
438
+
439
+ BENCHMARK(xnnpack_elu_f16)
440
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
441
+ ->UseRealTime();
442
+ BENCHMARK(xnnpack_elu_f32)
443
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
444
+ ->UseRealTime();
445
+ BENCHMARK(xnnpack_elu_qs8)
446
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
447
+ ->UseRealTime();
448
+
449
+ #ifdef BENCHMARK_TENSORFLOW_LITE
450
+ BENCHMARK(tflite_elu_f32)
451
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
452
+ ->UseRealTime();
453
+ BENCHMARK(tflite_elu_qs8)
454
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
455
+ ->UseRealTime();
456
+ #endif // BENCHMARK_TENSORFLOW_LITE
457
+
458
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
459
+ BENCHMARK_MAIN();
460
+ #endif
bench/end2end.cc ADDED
@@ -0,0 +1,201 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/models.h>
+
+
+ static void End2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory)
+ {
+   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_threads = state.range(0);
+   std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
+     pthreadpool_create(num_threads), pthreadpool_destroy);
+
+   auto execution_plan = model_factory(threadpool.get());
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), threadpool.get());
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
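+
+ // The wrappers below bind End2EndBenchmark to the model factories declared in <xnnpack/models.h>.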
+ static void FP32MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV1);
+ }
+
+ static void FP32MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV2);
+ }
+
+ static void FP32MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3Large);
+ }
+
+ static void FP32MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3Small);
+ }
+
+ #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+ static void FP32MobileNetV3SmallFused(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3SmallFused);
+ }
+ #endif  // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+
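+ // The Sparse80 variants run the same networks with 80% sparse weights (the 0.8f argument below).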
+ static void FP32Sparse80MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV1(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV2(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV3Large(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV3Small(0.8f, threadpool);
+   });
+ }
+
+ static void FP16MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV1);
+ }
+
+ static void FP16MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV2);
+ }
+
+ static void FP16MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV3Large);
+ }
+
+ static void FP16MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV3Small);
+ }
+
+ static void FP16Sparse80MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV1(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV2(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV3Large(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV3Small(0.8f, threadpool);
+   });
+ }
+
+ static void QC8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QC8MobileNetV1);
+ }
+
+ static void QC8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QC8MobileNetV2);
+ }
+
+ static void QS8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QS8MobileNetV1);
+ }
+
+ static void QS8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QS8MobileNetV2);
+ }
+
+ static void QU8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QU8MobileNetV1);
+ }
+
+ static void QU8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QU8MobileNetV2);
+ }
+
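+ // state.range(0) carries the thread count; MultiThreadingParameters (bench/utils.h) supplies the values swept.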
+ BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP32Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP16Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QC8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QC8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QS8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QU8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QU8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+ BENCHMARK(FP32MobileNetV3SmallFused)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ #endif  // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/end2end.h ADDED
@@ -0,0 +1,37 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack/models.h>
+
+
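+ // Each macro expands to one BENCHMARK_CAPTURE per MobileNet variant, binding the
+ // corresponding model factory to the given benchmark function.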
+ #define BENCHMARK_FP16_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP16MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP16MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP16MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP16MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_FP32_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_FP32_END2END_JIT(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3LargeJit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3SmallJit)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_QS8_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QS8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_QU8_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QU8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QU8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
bench/f16-conv-hwc2chw.cc ADDED
@@ -0,0 +1,130 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/dconv.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/conv.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/pack.h>
+
+
+ static void f16_conv_hwc2chw(benchmark::State& state,
+   xnn_f16_conv_hwc2chw_ukernel_fn conv,
+   uint32_t output_channels_tile,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t output_channels = state.range(2);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t input_channels = 3;
+   const size_t kernel_size = 3;
+   const size_t padding = 1;
+   const size_t subsampling = 2;
+
+   const size_t output_height = (input_height + 2 * padding - kernel_size) / subsampling + 1;
+   const size_t output_width = (input_width + 2 * padding - kernel_size) / subsampling + 1;
+
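+   // Fixed geometry: 3x3 kernel, stride 2, unit padding, 3 input channels; HWC input is
+   // converted to CHW output, as in the first layer of typical vision models.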
+   std::vector<uint16_t> input(input_height * input_width * input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::vector<uint16_t> kernel(output_channels * kernel_size * kernel_size * input_channels);
+   std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
+   std::vector<uint16_t> bias(output_channels);
+   std::generate(bias.begin(), bias.end(), std::ref(f16rng));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> zero(input_channels * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t weights_elements = (kernel_size * kernel_size * input_channels + 1) *
+     benchmark::utils::RoundUp<size_t>(output_channels, output_channels_tile);
+   const size_t output_elements = output_height * output_width * output_channels;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (weights_elements + output_elements));
+
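+   // Keep enough weight/output buffer copies to overflow the last-level cache, so every
+   // iteration reads cold packed weights instead of cached ones.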
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(weights_elements * num_buffers);
+   std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
+   xnn_pack_f16_dconv_oki_w(
+     output_channels, input_channels, output_channels_tile,
+     kernel_size /* kernel height */, kernel_size /* kernel width */,
+     kernel.data(), bias.data(), packed_weights.data(), nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(packed_weights.cbegin(),
+               packed_weights.cbegin() + weights_elements,
+               packed_weights.begin() + n * weights_elements);
+   }
+
+   std::vector<uint16_t> output(output_elements * num_buffers);
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_f16_minmax_params params;
+   init_params(&params, UINT16_C(0xFC00) /* min = -inf */, UINT16_C(0x7C00) /* max = +inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     conv(
+       input_height, input_width,
+       0 /* output_y_start */, output_height /* output_y_end */,
+       input.data(), zero.data(),
+       packed_weights.data() + buffer_index * weights_elements,
+       output.data() + buffer_index * output_elements,
+       padding, output_channels,
+       output_channels * output_width * sizeof(uint16_t),
+       output_channels * sizeof(uint16_t),
+       &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
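+   // 2 FLOPs (multiply and add) per MAC; MAC count = output pixels x kernel area x
+   // input channels x output channels.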
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       input_channels * output_channels *
+       kernel_size * kernel_size,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+   static void f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2(benchmark::State& state, const char* net) {
+     f16_conv_hwc2chw(state, xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2, 4,
+       xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   BENCHMARK_DCONV(f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2);
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-dwconv-e2e.cc ADDED
@@ -0,0 +1,736 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <cstring>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include "bench/end2end.h"
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/config.h>
+ #include <xnnpack/dwconv.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/models.h>
+
+
+ static void DWConvEnd2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory,
+   xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv_minmax,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint8_t channel_tile, uint8_t primary_tile,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
+   if (dwconv_config == nullptr) {
+     state.SkipWithError("hardware does not support F16 DWCONV");
+     return;
+   }
+
+   // Save dwconv_config so that we can modify it for the benchmark and later restore it.
+   struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
+   memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
+
+   // Override microkernels chosen in xnn_initialize
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     // Replace only the microkernel with the matching kernel size.
+     if (dwconv_config[i].primary_tile == primary_tile) {
+       std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
+
+       // Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
+       dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv_minmax);
+       dwconv_config[i].channel_tile = channel_tile;
+       dwconv_config[i].channel_subtile = channel_tile;
+       dwconv_config[i].channel_round = 1;
+       dwconv_config[i].primary_tile = primary_tile;
+       dwconv_config[i].init.f16 = init_params;
+       break;
+     }
+   }
+
+   auto execution_plan = model_factory(nullptr);
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), nullptr);
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   // Restore dwconv_config to original state as defined in init.c.
+   memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
+ }
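+
+ // Multipass overload: besides the first-pass tile it overrides middle/last tiles, and it
+ // replaces the config entry whose primary tile equals primary_tile_to_replace (or the first
+ // multipass entry found).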
+ static void DWConvEnd2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory,
+   xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv_minmax,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round,
+   uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile,
+   uint8_t primary_tile_to_replace,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
+   if (dwconv_config == nullptr) {
+     state.SkipWithError("failed to initialize f16 DWCONV config");
+     return;
+   }
+
+   // Save dwconv_config so that we can modify it for the benchmark and later restore it.
+   struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
+   memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
+
+   bool found = false;
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     if (dwconv_config[i].primary_tile == primary_tile_to_replace) {
+       found = true;
+     } else if (dwconv_config[i].last_tile != 0) {
+       // Found a multipass microkernel, replace it.
+       found = true;
+     }
+   }
+
+   if (!found) {
+     state.SkipWithError("can't replace with multipass");
+     return;
+   }
+
+   // Override microkernels chosen in xnn_initialize
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     // Replace only the microkernel with the matching kernel size.
+     if (dwconv_config[i].primary_tile == primary_tile_to_replace ||
+         dwconv_config[i].last_tile != 0) {
+       // Replace either when the primary_tile_to_replace matches, or replace the
+       // first multipass dwconv microkernel we find.
+       // TODO(zhin): support specifying target multipass dwconv to replace.
+       std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
+
+       // Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
+       dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv_minmax);
+       dwconv_config[i].channel_tile = channel_tile;
+       dwconv_config[i].channel_subtile = channel_subtile;
+       dwconv_config[i].channel_round = channel_round;
+       dwconv_config[i].primary_tile = primary_tile;
+       dwconv_config[i].middle_tile = middle_tile;
+       dwconv_config[i].last_tile = last_tile;
+       dwconv_config[i].init.f16 = init_params;
+       break;
+     }
+   }
+
+   auto execution_plan = model_factory(nullptr);
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), nullptr);
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
+ }
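+
+ // Microkernel names encode the tiling, matching the commented arguments below:
+ // "9p16c" = 9-element primary tile over 16 channels; "5f5m5l16c8s4r" = 5-tap
+ // first/middle/last passes, 16-channel tile, 8-channel subtile, channels rounded to 4.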
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+   static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
+
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+   static void f16_dwconv_25p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p32c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)
+
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-dwconv.cc ADDED
@@ -0,0 +1,795 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/dwconv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/dwconv.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microkernel-utils.h>
25
+ #include <xnnpack/microparams-init.h>
26
+ #include <xnnpack/operator.h>
27
+ #include <xnnpack/pack.h>
28
+
29
+
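+ // Microbenchmark harness for unipass f16 depthwise-convolution kernels: the
+ // kernel consumes all `primary_tile` weight taps in a single pass.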
30
+ static void f16_dwconv(benchmark::State& state,
31
+ xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv,
32
+ xnn_init_f16_minmax_params_fn init_params,
33
+ uint32_t channel_tile, uint32_t primary_tile,
34
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
35
+ {
36
+ if (isa_check != nullptr && !isa_check(state)) {
37
+ return;
38
+ }
39
+
40
+ const size_t input_height = state.range(0);
41
+ const size_t input_width = state.range(1);
42
+ const size_t kernel_height = state.range(2);
43
+ const size_t kernel_width = state.range(3);
44
+ const size_t padding_height = state.range(4);
45
+ const size_t padding_width = state.range(5);
46
+ const size_t subsampling = state.range(6);
47
+ const size_t dilation = state.range(7);
48
+ const size_t channels = state.range(8);
49
+
50
+ const size_t kernel_size = kernel_height * kernel_width;
51
+ if (kernel_size > primary_tile) {
52
+ state.SkipWithError("kernel size mismatch");
53
+ return;
54
+ }
55
+
56
+ std::random_device random_device;
57
+ auto rng = std::mt19937(random_device());
58
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
59
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
60
+
61
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
62
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
63
+ const size_t padding_left = padding_width / 2;
64
+ const size_t padding_top = padding_height / 2;
65
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
66
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
67
+ const size_t output_size = output_height * output_width;
68
+ const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
69
+ const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
70
+
71
+ const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
72
+
73
+ std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
74
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
75
+ std::vector<uint16_t> k(channels * kernel_height * kernel_width);
76
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
77
+ std::vector<uint16_t> b(channels);
78
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
79
+
80
+ std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
81
+
82
+ const size_t w_elements = (kernel_size + 1) * c_stride;
83
+ // Can read (primary_tile - kernel_size) elements after end of indirection buffer.
84
+ const size_t i_elements = (primary_tile - kernel_size) + output_height * step_height;
85
+ const size_t c_elements = output_size * channels;
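+ // Allocate enough buffer copies to exceed the last-level cache; rotating through
+ // them keeps each iteration's weights and outputs out of a warm cache.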
86
+ const size_t num_buffers = 1 +
87
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
88
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
89
+
90
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
91
+ std::fill(w.begin(), w.end(), UINT16_C(0));
92
+ xnn_pack_f16_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels,
93
+ channel_tile, channel_tile, /*channel_round=*/1,
94
+ k.data(), b.data(), w.data(),
95
+ /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
96
+ for (size_t n = 1; n < num_buffers; n++) {
97
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
98
+ }
99
+
100
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
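+ // Fill in a minimal operator struct so that xnn_indirection_init_dwconv2d can
+ // populate the indirection buffer with pointers into the input (or zero buffer).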
101
+ xnn_operator convolution_op = { };
102
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
103
+ convolution_op.input = a.data();
104
+ convolution_op.input_pixel_stride = channels;
105
+ convolution_op.zero_buffer = z.data();
106
+ convolution_op.input_height = input_height;
107
+ convolution_op.input_width = input_width;
108
+ convolution_op.output_height = output_height;
109
+ convolution_op.output_width = output_width;
110
+ convolution_op.kernel_height = kernel_height;
111
+ convolution_op.kernel_width = kernel_width;
112
+ convolution_op.stride_height = subsampling;
113
+ convolution_op.stride_width = subsampling;
114
+ convolution_op.dilation_height = dilation;
115
+ convolution_op.dilation_width = dilation;
116
+ convolution_op.padding_top = padding_top;
117
+ convolution_op.padding_left = padding_left;
118
+
119
+ xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, primary_tile, XNN_LOG2_SIZEOF_HALF);
120
+ for (size_t n = 1; n < num_buffers; n++) {
121
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
122
+ }
123
+
124
+ std::vector<uint16_t> c(c_elements * num_buffers);
125
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
126
+
127
+ xnn_f16_minmax_params params;
128
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
129
+
130
+ size_t buffer_index = 0;
131
+ for (auto _ : state) {
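+ // Outside the timed region: warm the input in cache and rotate to the next
+ // (cold) copy of the weights and outputs.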
132
+ state.PauseTiming();
133
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
134
+ buffer_index = (buffer_index + 1) % num_buffers;
135
+ state.ResumeTiming();
136
+
137
+ for (size_t y = 0; y < output_height; y++) {
138
+ dwconv(channels, output_width,
139
+ reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
140
+ w.data() + buffer_index * w_elements,
141
+ c.data() + buffer_index * c_elements + y * output_width * channels,
142
+ kernel_height * step_width * sizeof(void*), 0,
143
+ 0, z.data(), &params);
144
+ }
145
+ }
146
+
147
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
148
+ if (cpu_frequency != 0) {
149
+ state.counters["cpufreq"] = cpu_frequency;
150
+ }
151
+
152
+ state.counters["FLOPS"] = benchmark::Counter(
153
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
154
+
155
+ state.counters["bytes"] = benchmark::Counter(
156
+ uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
157
+ benchmark::Counter::kIsRate);
158
+ }
159
+
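+ // Overload for multipass f16 depthwise-convolution kernels, which split the taps
+ // into a first pass, repeated middle passes, and a last pass, accumulating
+ // intermediate results in `buffer`.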
160
+ static void f16_dwconv(benchmark::State& state,
161
+ xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv,
162
+ xnn_init_f16_minmax_params_fn init_params,
163
+ uint32_t first_pass_tile,
164
+ uint32_t middle_pass_tile,
165
+ uint32_t last_pass_tile,
166
+ uint32_t channel_tile,
167
+ uint32_t channel_subtile,
168
+ uint32_t channel_round,
169
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
170
+ {
171
+ if (isa_check != nullptr && !isa_check(state)) {
172
+ return;
173
+ }
174
+
175
+ const size_t input_height = state.range(0);
176
+ const size_t input_width = state.range(1);
177
+ const size_t kernel_height = state.range(2);
178
+ const size_t kernel_width = state.range(3);
179
+ const size_t padding_height = state.range(4);
180
+ const size_t padding_width = state.range(5);
181
+ const size_t subsampling = state.range(6);
182
+ const size_t dilation = state.range(7);
183
+ const size_t channels = state.range(8);
184
+
185
+ const size_t kernel_size = kernel_height * kernel_width;
186
+
187
+ if (kernel_size <= first_pass_tile) {
188
+ state.SkipWithError("kernel size mismatch");
189
+ return;
190
+ }
191
+
192
+ std::random_device random_device;
193
+ auto rng = std::mt19937(random_device());
194
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
195
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
196
+
197
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
198
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
199
+ const size_t padding_left = padding_width / 2;
200
+ const size_t padding_top = padding_height / 2;
201
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
202
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
203
+ const size_t output_size = output_height * output_width;
204
+ const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
205
+ const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
206
+
207
+ std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
208
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
209
+ std::vector<uint16_t> k(channels * kernel_size);
210
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
211
+ std::vector<uint16_t> b(channels);
212
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
213
+
214
+ std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
215
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(channels + XNN_MULTIPASS_EXTRA_BYTES / sizeof(uint16_t));
216
+
217
+ const size_t tile_size = xnn_dwconv_multipass_tile_size(
218
+ kernel_size, first_pass_tile, middle_pass_tile, last_pass_tile);
219
+ const size_t w_elements =
220
+ xnn_dwconv_multipass_weights_size(
221
+ tile_size, channels, channel_tile, channel_subtile, channel_round, /*bias_element_size=*/sizeof(uint16_t),
222
+ /*log2_filter_element_size=*/1, /*extra_weights_byte=*/0) /
223
+ sizeof(uint16_t);
224
+ // Can read (tile_size - kernel_size) elements after end of indirection buffer.
225
+ const size_t i_elements = tile_size - kernel_size + output_height * step_height;
226
+ const size_t c_elements = output_size * channels;
227
+ const size_t num_buffers = 1 +
228
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
229
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
230
+
231
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
232
+ std::fill(w.begin(), w.end(), UINT16_C(0));
233
+ xnn_pack_f16_dwconv_ghw_w(
234
+ first_pass_tile, middle_pass_tile, last_pass_tile,
235
+ kernel_height, kernel_width,
236
+ channels, channel_tile, channel_subtile, channel_round,
237
+ k.data(), b.data(), w.data(), /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
238
+ for (size_t n = 1; n < num_buffers; n++) {
239
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
240
+ }
241
+
242
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
243
+ xnn_operator convolution_op = { };
244
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
245
+ convolution_op.input = a.data();
246
+ convolution_op.input_pixel_stride = channels;
247
+ convolution_op.zero_buffer = z.data();
248
+ convolution_op.input_height = input_height;
249
+ convolution_op.input_width = input_width;
250
+ convolution_op.output_height = output_height;
251
+ convolution_op.output_width = output_width;
252
+ convolution_op.kernel_height = kernel_height;
253
+ convolution_op.kernel_width = kernel_width;
254
+ convolution_op.stride_height = subsampling;
255
+ convolution_op.stride_width = subsampling;
256
+ convolution_op.dilation_height = dilation;
257
+ convolution_op.dilation_width = dilation;
258
+ convolution_op.padding_top = padding_top;
259
+ convolution_op.padding_left = padding_left;
260
+
261
+ xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, tile_size, XNN_LOG2_SIZEOF_HALF);
262
+ for (size_t n = 1; n < num_buffers; n++) {
263
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
264
+ }
265
+
266
+ std::vector<uint16_t> c(c_elements * num_buffers);
267
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
268
+
269
+ xnn_f16_minmax_params params;
270
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
271
+
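+ // The multipass kernel is assumed to advance its indirection pointer by
+ // (tile_size - last_pass_tile) entries across its first and middle passes;
+ // the per-row input stride compensates for that advance.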
272
+ const int input_advanced = tile_size - last_pass_tile;
273
+ const int input_stride_elements = kernel_height * step_width - input_advanced;
274
+ size_t buffer_index = 0;
275
+ for (auto _ : state) {
276
+ state.PauseTiming();
277
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
278
+ buffer_index = (buffer_index + 1) % num_buffers;
279
+ state.ResumeTiming();
280
+
281
+ for (size_t y = 0; y < output_height; y++) {
282
+ dwconv(channels, output_width,
283
+ reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
284
+ w.data() + buffer_index * w_elements,
285
+ c.data() + buffer_index * c_elements + y * output_width * channels,
286
+ input_stride_elements * sizeof(void*), 0,
287
+ 0, z.data(), kernel_size, buffer.data(), &params);
288
+ }
289
+ }
290
+
291
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
292
+ if (cpu_frequency != 0) {
293
+ state.counters["cpufreq"] = cpu_frequency;
294
+ }
295
+
296
+ state.counters["FLOPS"] = benchmark::Counter(
297
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
298
+
299
+ state.counters["bytes"] = benchmark::Counter(
300
+ uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
301
+ benchmark::Counter::kIsRate);
302
+ }
303
+
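+ // Wrapper naming: <P>p<C>c denotes a unipass kernel with primary tile P and
+ // channel tile C; <F>f<M>m<L>l<C>c<S>s<R>r denotes a multipass kernel with
+ // first/middle/last-pass tiles F/M/L, channel tile C, subtile S, and round R.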
304
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
305
+ static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
306
+ f16_dwconv(state,
307
+ xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2,
308
+ xnn_init_f16_minmax_fp16arith_params,
309
+ 8, 4, benchmark::utils::CheckNEONFP16ARITH);
310
+ }
311
+
312
+ static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, const char* net) {
313
+ f16_dwconv(state,
314
+ xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith,
315
+ xnn_init_f16_minmax_fp16arith_params,
316
+ 8, 4, benchmark::utils::CheckNEONFP16ARITH);
317
+ }
318
+
319
+ static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
320
+ f16_dwconv(state,
321
+ xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2,
322
+ xnn_init_f16_minmax_fp16arith_params,
323
+ 8, 9, benchmark::utils::CheckNEONFP16ARITH);
324
+ }
325
+
326
+ static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, const char* net) {
327
+ f16_dwconv(state,
328
+ xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith,
329
+ xnn_init_f16_minmax_fp16arith_params,
330
+ 8, 9, benchmark::utils::CheckNEONFP16ARITH);
331
+ }
332
+
333
+ static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
334
+ f16_dwconv(state,
335
+ xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2,
336
+ xnn_init_f16_minmax_fp16arith_params,
337
+ 8, 25, benchmark::utils::CheckNEONFP16ARITH);
338
+ }
339
+
340
+ static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, const char* net) {
341
+ f16_dwconv(state,
342
+ xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith,
343
+ xnn_init_f16_minmax_fp16arith_params,
344
+ 8, 25, benchmark::utils::CheckNEONFP16ARITH);
345
+ }
346
+
347
+ static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
348
+ f16_dwconv(state,
349
+ xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2,
350
+ xnn_init_f16_minmax_fp16arith_params,
351
+ 16, 4, benchmark::utils::CheckNEONFP16ARITH);
352
+ }
353
+
354
+ static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, const char* net) {
355
+ f16_dwconv(state,
356
+ xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith,
357
+ xnn_init_f16_minmax_fp16arith_params,
358
+ 16, 4, benchmark::utils::CheckNEONFP16ARITH);
359
+ }
360
+
361
+ static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
362
+ f16_dwconv(state,
363
+ xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2,
364
+ xnn_init_f16_minmax_fp16arith_params,
365
+ 16, 9, benchmark::utils::CheckNEONFP16ARITH);
366
+ }
367
+
368
+ static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, const char* net) {
369
+ f16_dwconv(state,
370
+ xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith,
371
+ xnn_init_f16_minmax_fp16arith_params,
372
+ 16, 9, benchmark::utils::CheckNEONFP16ARITH);
373
+ }
374
+
375
+ static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
376
+ f16_dwconv(state,
377
+ xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2,
378
+ xnn_init_f16_minmax_fp16arith_params,
379
+ 16, 25, benchmark::utils::CheckNEONFP16ARITH);
380
+ }
381
+
382
+ static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, const char* net) {
383
+ f16_dwconv(state,
384
+ xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith,
385
+ xnn_init_f16_minmax_fp16arith_params,
386
+ 16, 25, benchmark::utils::CheckNEONFP16ARITH);
387
+ }
388
+
389
+ static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
390
+ f16_dwconv(state,
391
+ xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2,
392
+ xnn_init_f16_minmax_fp16arith_params,
393
+ 32, 4, benchmark::utils::CheckNEONFP16ARITH);
394
+ }
395
+
396
+ static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, const char* net) {
397
+ f16_dwconv(state,
398
+ xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith,
399
+ xnn_init_f16_minmax_fp16arith_params,
400
+ 32, 4, benchmark::utils::CheckNEONFP16ARITH);
401
+ }
402
+
403
+ static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
404
+ f16_dwconv(state,
405
+ xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2,
406
+ xnn_init_f16_minmax_fp16arith_params,
407
+ 32, 9, benchmark::utils::CheckNEONFP16ARITH);
408
+ }
409
+
410
+ static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, const char* net) {
411
+ f16_dwconv(state,
412
+ xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith,
413
+ xnn_init_f16_minmax_fp16arith_params,
414
+ 32, 9, benchmark::utils::CheckNEONFP16ARITH);
415
+ }
416
+
417
+ static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
418
+ f16_dwconv(state,
419
+ xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2,
420
+ xnn_init_f16_minmax_fp16arith_params,
421
+ 32, 25, benchmark::utils::CheckNEONFP16ARITH);
422
+ }
423
+
424
+ static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, const char* net) {
425
+ f16_dwconv(state,
426
+ xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith,
427
+ xnn_init_f16_minmax_fp16arith_params,
428
+ 32, 25, benchmark::utils::CheckNEONFP16ARITH);
429
+ }
430
+
431
+ static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
432
+ f16_dwconv(
433
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
434
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
435
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
436
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
437
+ }
438
+ static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
439
+ f16_dwconv(
440
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
441
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
442
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
443
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
444
+ }
445
+ static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
446
+ f16_dwconv(
447
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
448
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
449
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
450
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
451
+ }
452
+ static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
453
+ f16_dwconv(
454
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
455
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
456
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
457
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
458
+ }
459
+ static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
460
+ f16_dwconv(
461
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
462
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
463
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
464
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
465
+ }
466
+ static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
467
+ f16_dwconv(
468
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
469
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
470
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
471
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
472
+ }
473
+
474
+ static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
475
+ f16_dwconv(
476
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
477
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
478
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
479
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
480
+ }
481
+ static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
482
+ f16_dwconv(
483
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
484
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
485
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
486
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
487
+ }
488
+ static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
489
+ f16_dwconv(
490
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
491
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
492
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
493
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
494
+ }
495
+ static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
496
+ f16_dwconv(
497
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
498
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
499
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
500
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
501
+ }
502
+ static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
503
+ f16_dwconv(
504
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
505
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
506
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
507
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
508
+ }
509
+ static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
510
+ f16_dwconv(
511
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
512
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
513
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
514
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
515
+ }
516
+
517
+ static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
518
+ f16_dwconv(
519
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
520
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
521
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
522
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
523
+ }
524
+ static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
525
+ f16_dwconv(
526
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
527
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
528
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
529
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
530
+ }
531
+ static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
532
+ f16_dwconv(
533
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
534
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
535
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
536
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
537
+ }
538
+ static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
539
+ f16_dwconv(
540
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
541
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
542
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
543
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
544
+ }
545
+ static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
546
+ f16_dwconv(
547
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
548
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
549
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
550
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
551
+ }
552
+ static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
553
+ f16_dwconv(
554
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
555
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
556
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
557
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
558
+ }
559
+
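+ // Register each wrapper against the depthwise-convolution shapes from bench/dwconv.h.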
560
+ BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith_acc2)
561
+ BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith)
562
+ BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith_acc2)
563
+ BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith)
564
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith_acc2)
565
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith)
566
+ BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith_acc2)
567
+ BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith)
568
+ BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith_acc2)
569
+ BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith)
570
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith_acc2)
571
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith)
572
+ BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith_acc2)
573
+ BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith)
574
+ BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith_acc2)
575
+ BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith)
576
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith_acc2)
577
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith)
578
+
579
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
580
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
581
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
582
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
583
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
584
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)
585
+
586
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
587
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
588
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
589
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
590
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
591
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)
592
+
593
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
594
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
595
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
596
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
597
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
598
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
599
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
600
+
601
+
602
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
603
+ static void f16_dwconv_25p8c__fma3(benchmark::State& state, const char* net) {
604
+ f16_dwconv(
605
+ state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
606
+ /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
607
+ }
608
+ static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, const char* net) {
609
+ f16_dwconv(
610
+ state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
611
+ /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
612
+ }
613
+ static void f16_dwconv_25p16c__fma3(benchmark::State& state, const char* net) {
614
+ f16_dwconv(
615
+ state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
616
+ /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
617
+ }
618
+ static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, const char* net) {
619
+ f16_dwconv(
620
+ state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
621
+ /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
622
+ }
623
+ static void f16_dwconv_25p32c__fma3(benchmark::State& state, const char* net) {
624
+ f16_dwconv(
625
+ state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
626
+ /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
627
+ }
628
+ static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, const char* net) {
629
+ f16_dwconv(
630
+ state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
631
+ /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
632
+ }
633
+
634
+ static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, const char* net) {
635
+ f16_dwconv(
636
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
637
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
638
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
639
+ /*isa_check=*/benchmark::utils::CheckFMA3);
640
+ }
641
+ static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
642
+ f16_dwconv(
643
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
644
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
645
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
646
+ /*isa_check=*/benchmark::utils::CheckFMA3);
647
+ }
648
+ static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, const char* net) {
649
+ f16_dwconv(
650
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
651
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
652
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
653
+ /*isa_check=*/benchmark::utils::CheckFMA3);
654
+ }
655
+ static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
656
+ f16_dwconv(
657
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
658
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
659
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
660
+ /*isa_check=*/benchmark::utils::CheckFMA3);
661
+ }
662
+ static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, const char* net) {
663
+ f16_dwconv(
664
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
665
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
666
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
667
+ /*isa_check=*/benchmark::utils::CheckFMA3);
668
+ }
669
+ static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
670
+ f16_dwconv(
671
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
672
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
673
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
674
+ /*isa_check=*/benchmark::utils::CheckFMA3);
675
+ }
676
+
677
+ static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, const char* net) {
678
+ f16_dwconv(
679
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
680
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
681
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
682
+ /*isa_check=*/benchmark::utils::CheckFMA3);
683
+ }
684
+ static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
685
+ f16_dwconv(
686
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
687
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
688
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
689
+ /*isa_check=*/benchmark::utils::CheckFMA3);
690
+ }
691
+ static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, const char* net) {
692
+ f16_dwconv(
693
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
694
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
695
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
696
+ /*isa_check=*/benchmark::utils::CheckFMA3);
697
+ }
698
+ static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
699
+ f16_dwconv(
700
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
701
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
702
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
703
+ /*isa_check=*/benchmark::utils::CheckFMA3);
704
+ }
705
+ static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, const char* net) {
706
+ f16_dwconv(
707
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
708
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
709
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
710
+ /*isa_check=*/benchmark::utils::CheckFMA3);
711
+ }
712
+ static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
713
+ f16_dwconv(
714
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
715
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
716
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
717
+ /*isa_check=*/benchmark::utils::CheckFMA3);
718
+ }
719
+
720
+ static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, const char* net) {
721
+ f16_dwconv(
722
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
723
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
724
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
725
+ /*isa_check=*/benchmark::utils::CheckFMA3);
726
+ }
727
+ static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
728
+ f16_dwconv(
729
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
730
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
731
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
732
+ /*isa_check=*/benchmark::utils::CheckFMA3);
733
+ }
734
+ static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, const char* net) {
735
+ f16_dwconv(
736
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
737
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
738
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
739
+ /*isa_check=*/benchmark::utils::CheckFMA3);
740
+ }
741
+ static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
742
+ f16_dwconv(
743
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
744
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
745
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
746
+ /*isa_check=*/benchmark::utils::CheckFMA3);
747
+ }
748
+ static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, const char* net) {
749
+ f16_dwconv(
750
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
751
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
752
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
753
+ /*isa_check=*/benchmark::utils::CheckFMA3);
754
+ }
755
+ static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
756
+ f16_dwconv(
757
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
758
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
759
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
760
+ /*isa_check=*/benchmark::utils::CheckFMA3);
761
+ }
762
+
763
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3)
764
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3_acc2)
765
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3)
766
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3_acc2)
767
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3)
768
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3_acc2)
769
+
770
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3)
771
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
772
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3)
773
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
774
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3)
775
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)
776
+
777
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3)
778
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
779
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3)
780
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
781
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3)
782
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)
783
+
784
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3)
785
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
786
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3)
787
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
788
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3)
789
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)
790
+
791
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
792
+
793
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
794
+ BENCHMARK_MAIN();
795
+ #endif
bench/f16-dwconv2d-chw.cc ADDED
@@ -0,0 +1,496 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/dwconv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/dwconv.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microparams-init.h>
25
+ #include <xnnpack/operator.h>
26
+ #include <xnnpack/pack.h>
27
+
28
+
29
+ static void f16_dwconv2d_chw(benchmark::State& state,
30
+ xnn_f16_dwconv2d_chw_ukernel_fn dwconv,
31
+ xnn_init_f16_chw_params_fn init_params,
32
+ uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s,
33
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
34
+ {
35
+ if ((isa_check != nullptr) && !isa_check(state)) {
36
+ return;
37
+ }
38
+
39
+ const size_t input_height = state.range(0);
40
+ const size_t input_width = state.range(1);
41
+ const size_t kernel_height = state.range(2);
42
+ const size_t kernel_width = state.range(3);
43
+ const size_t padding_height = state.range(4);
44
+ const size_t padding_width = state.range(5);
45
+ const size_t subsampling = state.range(6);
46
+ const size_t dilation = state.range(7);
47
+ const size_t channels = state.range(8);
48
+
49
+ if (kernel_height != kh) {
50
+ state.SkipWithError("kernel height mismatch");
51
+ return;
52
+ }
53
+
54
+ if (kernel_width != kw) {
55
+ state.SkipWithError("kernel width mismatch");
56
+ return;
57
+ }
58
+
59
+ if (subsampling != s) {
60
+ state.SkipWithError("subsampling mismatch");
61
+ return;
62
+ }
63
+
64
+ if (padding_width % 2 != 0 || padding_width / 2 != pw) {
65
+ state.SkipWithError("padding width mismatch");
66
+ return;
67
+ }
68
+
69
+ if (dilation != 1) {
70
+ state.SkipWithError("unsupported dilation");
71
+ return;
72
+ }
73
+
74
+ std::random_device random_device;
75
+ auto rng = std::mt19937(random_device());
76
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
77
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
78
+
79
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
80
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
81
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
82
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
83
+
84
+ const size_t inputSize = (input_height + padding_height) * input_width;
85
+ const size_t kernel_size = kernel_height * kernel_width;
86
+ const size_t output_size = output_height * output_width;
87
+
88
+ std::vector<uint16_t> input(inputSize * channels + 2 * XNN_EXTRA_BYTES);
89
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
90
+ std::vector<uint16_t> bias(channels);
91
+ std::generate(bias.begin(), bias.end(), std::ref(f16rng));
92
+ std::vector<uint16_t> kernel(channels * kernel_size);
93
+ std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
94
+ std::vector<uint16_t> zero(input_width + padding_width);
95
+
96
+ const size_t w_elements = (kernel_size + 1) * channels;
97
+ const size_t o_elements = output_size * channels;
98
+ const size_t num_buffers = 1 +
99
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
100
+ sizeof(uint16_t) * (w_elements + o_elements));
101
+
102
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(w_elements * num_buffers);
103
+ std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
104
+ for (size_t c = 0; c < channels; c++) {
105
+ packed_weights[c * kernel_size + c] = bias[c];
106
+ for (size_t i = 0; i < kernel_size; i++) {
107
+ packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
108
+ }
109
+ }
110
+ for (size_t n = 1; n < num_buffers; n++) {
111
+ std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
112
+ }
113
+
114
+ std::vector<uint16_t> output(o_elements * num_buffers);
115
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
116
+
117
+ xnn_f16_chw_params chw_params;
118
+ init_params(&chw_params,
119
+ input_width, 0xFC00 /* -inf */, 0x7C00 /* inf */);
120
+
121
+ size_t buffer_index = 0;
122
+ for (auto _ : state) {
123
+ state.PauseTiming();
124
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
125
+ buffer_index = (buffer_index + 1) % num_buffers;
126
+ state.ResumeTiming();
127
+
128
+ for (uint32_t channel = 0; channel < channels; channel++) {
129
+ dwconv(
130
+ input_height, input_width * sizeof(uint16_t),
131
+ input.data() + channel * inputSize,
132
+ packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
133
+ zero.data(),
134
+ output.data() + channel * output_size + buffer_index * o_elements,
135
+ padding_height / 2, // padding_top
136
+ &chw_params);
137
+ }
138
+ }
139
+
140
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
141
+ if (cpu_frequency != 0) {
142
+ state.counters["cpufreq"] = cpu_frequency;
143
+ }
144
+
145
+ state.counters["FLOPS"] = benchmark::Counter(
146
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
147
+ benchmark::Counter::kIsRate);
148
+
149
+ state.counters["bytes"] = benchmark::Counter(
150
+ uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
151
+ benchmark::Counter::kIsRate);
152
+ }
153
+
154
+
155
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
156
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8(benchmark::State& state, const char* net) {
157
+ f16_dwconv2d_chw(state,
158
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8,
159
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
160
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
161
+ benchmark::utils::CheckNEONFP16ARITH);
162
+ }
163
+ static void dwconv2d_chw_3x3p1__neonfp16arith_2x8(benchmark::State& state, const char* net) {
164
+ f16_dwconv2d_chw(state,
165
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
166
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
167
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
168
+ benchmark::utils::CheckNEONFP16ARITH);
169
+ }
170
+ static void dwconv2d_chw_3x3p1__neonfp16arith_3x8(benchmark::State& state, const char* net) {
171
+ f16_dwconv2d_chw(state,
172
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_3x8,
173
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
174
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
175
+ benchmark::utils::CheckNEONFP16ARITH);
176
+ }
177
+ static void dwconv2d_chw_3x3p1__neonfp16arith_4x8(benchmark::State& state, const char* net) {
178
+ f16_dwconv2d_chw(state,
179
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_4x8,
180
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
181
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
182
+ benchmark::utils::CheckNEONFP16ARITH);
183
+ }
184
+ static void dwconv2d_chw_3x3p1__neonfp16arith_5x8(benchmark::State& state, const char* net) {
185
+ f16_dwconv2d_chw(state,
186
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_5x8,
187
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
188
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
189
+ benchmark::utils::CheckNEONFP16ARITH);
190
+ }
191
+ static void dwconv2d_chw_3x3p1__neonfp16arith_6x8(benchmark::State& state, const char* net) {
192
+ f16_dwconv2d_chw(state,
193
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_6x8,
194
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
195
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
196
+ benchmark::utils::CheckNEONFP16ARITH);
197
+ }
198
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
199
+ f16_dwconv2d_chw(state,
200
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc2,
201
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
202
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
203
+ benchmark::utils::CheckNEONFP16ARITH);
204
+ }
205
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
206
+ f16_dwconv2d_chw(state,
207
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc3,
208
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
209
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
210
+ benchmark::utils::CheckNEONFP16ARITH);
211
+ }
212
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
213
+ f16_dwconv2d_chw(state,
214
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc4,
215
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
216
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
217
+ benchmark::utils::CheckNEONFP16ARITH);
218
+ }
219
+ static void dwconv2d_chw_3x3p1__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
220
+ f16_dwconv2d_chw(state,
221
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8_acc2,
222
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
223
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
224
+ benchmark::utils::CheckNEONFP16ARITH);
225
+ }
226
+
227
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8(benchmark::State& state, const char* net) {
228
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_4x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_4x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_4x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_5x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_5x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc5,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc5,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
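+ // Benchmark registrations. Suffix convention (per the XNNPACK microkernel naming
+ // scheme): KhxKw[sS]pP = kernel size, stride, and one-sided width padding;
+ // MxN = output rows x pixels computed per microkernel iteration; accK = K separate
+ // vector accumulators, combined at the end, to hide floating-point latency.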
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_5x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_6x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_5x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2)
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32-vcvt.cc ADDED
@@ -0,0 +1,414 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vcvt.h>
+
+
+ static void f16_f32_vcvt(
+   benchmark::State& state,
+   xnn_f16_f32_vcvt_ukernel_fn cvt,
+   xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+
+   const size_t num_elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
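+   // The input is over-allocated by XNN_EXTRA_BYTES: XNNPACK microkernels are allowed
+   // to read (but not depend on) a few bytes past the last element.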
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<float, AlignedAllocator<float, 64>> y(num_elements);
+   std::generate(x.begin(), x.end(), std::ref(f16rng));
+   std::fill(y.begin(), y.end(), std::nanf(""));
+
+   xnn_f16_f32_cvt_params params;
+   if (init_params != nullptr) {
+     init_params(&params);
+   }
+   for (auto _ : state) {
+     cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = num_elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = num_elements * (sizeof(uint16_t) + sizeof(float));
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
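+ // One benchmark per microkernel variant. The suffix names the implementation and
+ // batch tile: xN converts N elements per loop iteration; the *_int16_* / *_int32_*
+ // variants widen f16 to f32 with integer bit manipulation at that lane width, while
+ // the neonfp16 / f16c / avx512skx variants use native hardware conversion
+ // instructions (and therefore pass no init params).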
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x8,
+     xnn_f16_f32_vcvt_ukernel__neonfp16_x8,
+     nullptr /* init params */,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x16,
+     xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x8,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x24,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x32,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x8,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x16,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x24,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x32,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x16,
+     xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckAVX512SKX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x32,
+     xnn_f16_f32_vcvt_ukernel__avx512skx_x32,
+     nullptr /* init params */,
+     benchmark::utils::CheckAVX512SKX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x8,
+     xnn_f16_f32_vcvt_ukernel__f16c_x8,
+     nullptr /* init params */,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x16,
+     xnn_f16_f32_vcvt_ukernel__f16c_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_WASMRELAXEDSIMD
+
+ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x1,
+     xnn_f16_f32_vcvt_ukernel__scalar_x1,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x2,
+     xnn_f16_f32_vcvt_ukernel__scalar_x2,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x3,
+     xnn_f16_f32_vcvt_ukernel__scalar_x3,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x4,
+     xnn_f16_f32_vcvt_ukernel__scalar_x4,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-gemm.cc ADDED
@@ -0,0 +1,162 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/gemm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/gemm.h>
+ #include <xnnpack/math.h>
+ #include <xnnpack/pack.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ static void f16_gemm(benchmark::State& state,
+   xnn_f16_gemm_minmax_ukernel_fn gemm,
+   size_t mr, size_t nr, size_t kr, size_t sr,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(nc * kc);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(nc);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   const size_t w_elements = nc_stride * kc_stride + nc_stride;
+   const size_t c_elements = mc * nc;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
+     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
+     // - W is not in cache (for any cache level)
+     // - C is not in cache (for any cache level)
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < mc; m += mr) {
+       const uint32_t mb = min(mc - m, mr);
+       for (uint32_t n = 0; n < nc; n += nr) {
+         const uint32_t nb = min(nc - n, nr);
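+         // Packed weights interleave, for each strip of nr output columns, nr bias
+         // values followed by that strip's weights, so strip n begins
+         // (n * (kc_stride + 1)) elements into the buffer.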
+         gemm(
+           mb, nb, kc * sizeof(uint16_t),
+           a.data() + m * kc, kc * sizeof(uint16_t),
+           w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
+           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
+           &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
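+   // Each of the mc * nc outputs takes kc multiply-adds, counted as 2 FLOPs apiece.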
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_f32acc_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_GEMM(f16_f32acc_gemm_1x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_4x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_5x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_6x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_7x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_1x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_3x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_4x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_5x16__avx2_broadcast)
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-igemm.cc ADDED
@@ -0,0 +1,214 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/conv.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/igemm.h>
+ #include <xnnpack/indirection.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/operator.h>
+ #include <xnnpack/pack.h>
+
+
+ static void f16_igemm(benchmark::State& state,
+   xnn_f16_igemm_minmax_ukernel_fn igemm,
+   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t kernel_height = state.range(2);
+   const size_t kernel_width = state.range(3);
+   const size_t kernel_size = kernel_height * kernel_width;
+   const size_t padding_height = state.range(4);
+   const size_t padding_width = state.range(5);
+   const size_t subsampling = state.range(6);
+   const size_t dilation = state.range(7);
+   const size_t group_input_channels = state.range(8);
+   const size_t group_output_channels = state.range(9);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t output_pixel_stride = group_output_channels;
+   const size_t input_pixel_stride = group_input_channels;
+   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+   const size_t padding_left = padding_width / 2;
+   const size_t padding_top = padding_height / 2;
+   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+   const size_t output_size = output_height * output_width;
+
+   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
+   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
+
+   std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(group_output_channels);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
+   const size_t i_elements = mc_stride * kernel_size;
+   const size_t c_elements = output_height * output_width * output_pixel_stride;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_conv_goki_w(
+     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
+     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+   }
+
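+   // The IGEMM microkernel reads its input through an indirection buffer: an array of
+   // row pointers, one per (kernel element, output pixel) pair, filled in by
+   // xnn_indirection_init_conv2d below. This stands in for an explicit im2col copy.
+   // A stub xnn_operator carries the convolution geometry to that init helper.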
+   std::vector<const uint16_t*> i(i_elements * num_buffers);
+   xnn_operator convolution_op = { };
+   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+   convolution_op.input = a.data();
+   convolution_op.input_pixel_stride = input_pixel_stride;
+   convolution_op.zero_buffer = z.data();
+   convolution_op.groups = 1;
+   convolution_op.group_input_channels = group_input_channels;
+   convolution_op.batch_size = 1;
+   convolution_op.input_height = input_height;
+   convolution_op.input_width = input_width;
+   convolution_op.output_height = output_height;
+   convolution_op.output_width = output_width;
+   convolution_op.kernel_height = kernel_height;
+   convolution_op.kernel_width = kernel_width;
+   convolution_op.stride_height = subsampling;
+   convolution_op.stride_width = subsampling;
+   convolution_op.dilation_height = dilation;
+   convolution_op.dilation_width = dilation;
+   convolution_op.padding_top = padding_top;
+   convolution_op.padding_left = padding_left;
+   xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+   }
+
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters (min first, then max, matching the f16 GEMM benchmark).
+   xnn_f16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < output_size; m += mr) {
+       const uint32_t mb = min(output_size - m, mr);
+       for (uint32_t n = 0; n < group_output_channels; n += nr) {
+         const uint32_t nb = min(group_output_channels - n, nr);
+         igemm(
+           mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
+           reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
+           w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
+           c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
+           0, z.data(), &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_f32acc_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_CONV(f16_f32acc_igemm_1x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_4x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_5x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_6x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_7x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_1x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_3x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_4x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_5x16__avx2_broadcast)
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-rsum.cc ADDED
@@ -0,0 +1,140 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/reduce.h>
+
+
+ static void f16_f32acc_rsum(
+   benchmark::State& state,
+   xnn_f16_f32acc_rsum_ukernel_fn rsum,
+   xnn_init_f16_f32acc_scale_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+
+   xnn_f16_f32acc_scale_params params;
+   init_params(&params, /*scale=*/0.1f);
+
+   uint16_t output = UINT16_C(0x7E00) /* NaN */;
+   for (auto _ : state) {
+     rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
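+ // The _accN suffix marks variants that spread the running sum over N vector
+ // accumulators to hide floating-point add latency; the partial sums are combined
+ // once at the end of the reduction.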
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x4,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x4,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x8,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x8,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x16_acc2,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x16_acc2,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x24_acc3,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x24_acc3,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc2,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc2,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc4,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc4,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x8,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x8,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x16_acc2,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x16_acc2,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x24_acc3,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x24_acc3,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc2,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc2,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc4,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc4,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-gavgpool-cw.cc ADDED
@@ -0,0 +1,77 @@
1
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <functional>
9
+ #include <numeric>
10
+ #include <vector>
11
+
12
+ #include "bench/utils.h"
13
+ #include <benchmark/benchmark.h>
14
+
15
+ #include <xnnpack.h>
16
+ #include <xnnpack/aligned-allocator.h>
17
+ #include <xnnpack/common.h>
18
+ #include <xnnpack/gavgpool.h>
19
+ #include <xnnpack/microfnptr.h>
20
+ #include <xnnpack/microparams-init.h>
21
+
22
+
23
+ void f16_gavgpool_cw(
24
+ benchmark::State& state,
25
+ xnn_f16_gavgpool_cw_ukernel_fn gavgpool_cw,
26
+ xnn_init_f16_gavgpool_neonfp16arith_params_fn init_params,
27
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
28
+ {
29
+ if (isa_check && !isa_check(state)) {
30
+ return;
31
+ }
32
+ const size_t channels = state.range(0);
33
+ const size_t elements = state.range(1);
34
+
35
+ std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(elements * channels + XNN_EXTRA_BYTES / sizeof(int16_t));
36
+ std::vector<int16_t> output(channels);
37
+ std::iota(input.begin(), input.end(), 0);
38
+
39
+ // Prepare parameters.
40
+ union xnn_f16_gavgpool_params params;
41
+ init_params(&params,
42
+ UINT16_C(0x3C00) /* scale */, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */, elements);
43
+
44
+ for (auto _ : state) {
45
+ gavgpool_cw(elements, channels, input.data(), output.data(), &params);
46
+ }
47
+
48
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49
+ if (cpu_frequency != 0) {
50
+ state.counters["cpufreq"] = cpu_frequency;
51
+ }
52
+ }
53
+
54
+ static void BenchmarkBatch(benchmark::internal::Benchmark* b)
55
+ {
56
+ b->ArgNames({"channels", "elements"});
57
+ b->Args({1, 1024});
58
+ b->Args({2, 1024});
59
+ b->Args({4, 1024});
60
+ b->Args({6, 1024});
61
+ b->Args({8, 1024});
62
+ b->Args({16, 1024});
63
+ b->Args({1024, 1024});
64
+ }
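+ // The grid above fixes elements at 1024 and sweeps the channel count: the
+ // small channel counts exercise the kernel's channel-remainder handling,
+ // while 1024x1024 is presumably large enough to be bandwidth-bound rather
+ // than compute-bound.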
65
+
66
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
67
+ BENCHMARK_CAPTURE(f16_gavgpool_cw, f16_neon_x8,
68
+ xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8,
69
+ xnn_init_f16_gavgpool_neonfp16arith_params,
70
+ benchmark::utils::CheckNEONFP16ARITH)
71
+ ->Apply(BenchmarkBatch)
72
+ ->UseRealTime();
73
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
74
+
75
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
76
+ BENCHMARK_MAIN();
77
+ #endif
bench/f16-gemm-e2e.cc ADDED
@@ -0,0 +1,452 @@
1
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <cstring>
9
+ #include <functional>
10
+ #include <memory>
11
+ #include <random>
12
+ #include <vector>
13
+
14
+ #include "bench/end2end.h"
15
+ #include "bench/utils.h"
16
+ #include <benchmark/benchmark.h>
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/config.h>
20
+ #include <xnnpack/gemm.h>
21
+ #include <xnnpack/igemm.h>
22
+ #include <xnnpack/microfnptr.h>
23
+ #include <xnnpack/microparams-init.h>
24
+ #include <xnnpack/models.h>
25
+ #include <xnnpack/pack.h>
26
+
27
+
28
+ static void GEMMEnd2EndBenchmark(
29
+ benchmark::State& state,
30
+ models::ExecutionPlanFactory model_factory,
31
+ xnn_f16_gemm_minmax_ukernel_fn gemm_minmax,
32
+ xnn_f16_igemm_minmax_ukernel_fn igemm_minmax,
33
+ xnn_f16_gemm_minmax_ukernel_fn gemm1_minmax,
34
+ xnn_f16_igemm_minmax_ukernel_fn igemm1_minmax,
35
+ xnn_init_f16_minmax_params_fn init_params,
36
+ uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
37
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
38
+ {
39
+ if (isa_check != nullptr && !isa_check(state)) {
40
+ return;
41
+ }
42
+ if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
43
+ state.SkipWithError("failed to initialize XNNPACK");
44
+ return;
45
+ }
46
+
47
+ struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config();
48
+ if (gemm_config == nullptr) {
49
+ state.SkipWithError("hardware does not support F16 gemm");
50
+ return;
51
+ }
52
+
53
+ // Override microkernels chosen in xnn_initialize
54
+ std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config));
55
+ gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_minmax));
56
+ gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_minmax));
57
+ gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_minmax));
58
+ gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_minmax));
59
+ gemm_config->init.f16 = init_params;
60
+ gemm_config->mr = mr;
61
+ gemm_config->nr = nr;
62
+ gemm_config->log2_kr = log2_kr;
63
+ gemm_config->log2_sr = log2_sr;
64
+ gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f16_gemm_goi_w;
65
+
66
+ auto execution_plan = model_factory(nullptr);
67
+ if (execution_plan.empty()) {
68
+ state.SkipWithError("failed to create a model");
69
+ return;
70
+ }
71
+
72
+ for (auto _ : state) {
73
+ for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
74
+ xnn_status status = xnn_run_operator(op.get(), nullptr);
75
+ if (status != xnn_status_success) {
76
+ state.SkipWithError("failed to run a model");
77
+ return;
78
+ }
79
+ }
80
+ }
81
+
82
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
83
+ if (cpu_frequency != 0) {
84
+ state.counters["cpufreq"] = cpu_frequency;
85
+ }
86
+ }
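+ // How the override above works: xnn_init_f16_gemm_config() returns a pointer
+ // to the process-wide F16 GEMM config, so zeroing it and repopulating slots
+ // [0] and [mr-1] forces the end-to-end model run onto exactly one kernel
+ // pair. Slot [mr-1] holds the full mr-row kernel; slot [0] holds the 1-row
+ // kernel used for the remainder when fewer than mr rows are left.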
87
+
88
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
89
+ static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
90
+ GEMMEnd2EndBenchmark(state, model,
91
+ xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
92
+ xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
93
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
94
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
95
+ xnn_init_f16_minmax_fp16arith_params,
96
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
97
+ benchmark::utils::CheckNEONFP16ARITH);
98
+ }
99
+
100
+ static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
101
+ GEMMEnd2EndBenchmark(state, model,
102
+ xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
103
+ xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
104
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
105
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
106
+ xnn_init_f16_minmax_fp16arith_params,
107
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
108
+ benchmark::utils::CheckNEONFP16ARITH);
109
+ }
110
+
111
+ static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
112
+ GEMMEnd2EndBenchmark(state, model,
113
+ xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
114
+ xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
115
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
116
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
117
+ xnn_init_f16_minmax_fp16arith_params,
118
+ 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
119
+ benchmark::utils::CheckNEONFP16ARITH);
120
+ }
121
+
122
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
123
+ GEMMEnd2EndBenchmark(state, model,
124
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
125
+ xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
126
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
127
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
128
+ xnn_init_f16_minmax_fp16arith_params,
129
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
130
+ benchmark::utils::CheckNEONFP16ARITH);
131
+ }
132
+
133
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
134
+ GEMMEnd2EndBenchmark(state, model,
135
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
136
+ xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
137
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
138
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
139
+ xnn_init_f16_minmax_fp16arith_params,
140
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
141
+ benchmark::utils::CheckNEONFP16ARITH);
142
+ }
143
+
144
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
145
+ GEMMEnd2EndBenchmark(state, model,
146
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
147
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
148
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
149
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
150
+ xnn_init_f16_minmax_fp16arith_params,
151
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
152
+ benchmark::utils::CheckNEONFP16ARITH);
153
+ }
154
+
155
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
156
+ GEMMEnd2EndBenchmark(state, model,
157
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
158
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
159
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
160
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
161
+ xnn_init_f16_minmax_fp16arith_params,
162
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
163
+ benchmark::utils::CheckNEONFP16ARITH);
164
+ }
165
+
166
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
167
+ GEMMEnd2EndBenchmark(state, model,
168
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
169
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
170
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
171
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
172
+ xnn_init_f16_minmax_fp16arith_params,
173
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
174
+ benchmark::utils::CheckNEONFP16ARITH);
175
+ }
176
+
177
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, models::ExecutionPlanFactory model) {
178
+ GEMMEnd2EndBenchmark(state, model,
179
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
180
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
181
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
182
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
183
+ xnn_init_f16_minmax_fp16arith_params,
184
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
185
+ benchmark::utils::CheckNEONFP16ARITH);
186
+ }
187
+
188
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
189
+ GEMMEnd2EndBenchmark(state, model,
190
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
191
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
192
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
193
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
194
+ xnn_init_f16_minmax_fp16arith_params,
195
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
196
+ benchmark::utils::CheckNEONFP16ARITH);
197
+ }
198
+
199
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64);
200
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64);
201
+ BENCHMARK_FP16_END2END(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64);
202
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32);
203
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64);
204
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32);
205
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64);
206
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55);
207
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
208
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75);
209
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
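+ // BENCHMARK_FP16_END2END comes from bench/end2end.h; presumably it registers
+ // the given function once per FP16-capable end-to-end model (e.g. the FP16
+ // MobileNet variants), passing the corresponding ExecutionPlanFactory as the
+ // `model` argument.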
210
+
211
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
212
+ static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
213
+ GEMMEnd2EndBenchmark(state, model,
214
+ xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
215
+ xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
216
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
217
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
218
+ xnn_init_f16_minmax_fp16arith_params,
219
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
220
+ benchmark::utils::CheckNEONFP16ARITH);
221
+ }
222
+
223
+ static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
224
+ GEMMEnd2EndBenchmark(state, model,
225
+ xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
226
+ xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
227
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
228
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
229
+ xnn_init_f16_minmax_fp16arith_params,
230
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
231
+ benchmark::utils::CheckNEONFP16ARITH);
232
+ }
233
+
234
+ static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
235
+ GEMMEnd2EndBenchmark(state, model,
236
+ xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
237
+ xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
238
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
239
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
240
+ xnn_init_f16_minmax_fp16arith_params,
241
+ 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
242
+ benchmark::utils::CheckNEONFP16ARITH);
243
+ }
244
+
245
+ static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
246
+ GEMMEnd2EndBenchmark(state, model,
247
+ xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
248
+ xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
249
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
250
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
251
+ xnn_init_f16_minmax_fp16arith_params,
252
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
253
+ benchmark::utils::CheckNEONFP16ARITH);
254
+ }
255
+
256
+ static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
257
+ GEMMEnd2EndBenchmark(state, model,
258
+ xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
259
+ xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
260
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
261
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
262
+ xnn_init_f16_minmax_fp16arith_params,
263
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
264
+ benchmark::utils::CheckNEONFP16ARITH);
265
+ }
266
+
267
+ static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
268
+ GEMMEnd2EndBenchmark(state, model,
269
+ xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
270
+ xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
271
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
272
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
273
+ xnn_init_f16_minmax_fp16arith_params,
274
+ 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
275
+ benchmark::utils::CheckNEONFP16ARITH);
276
+ }
277
+
278
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__neonfp16arith_ld64);
279
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__neonfp16arith_ld64);
280
+ BENCHMARK_FP16_END2END(f16_gemm_8x8__neonfp16arith_ld64);
281
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__neonfp16arith_ld64);
282
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__neonfp16arith_ld64);
283
+ BENCHMARK_FP16_END2END(f16_gemm_8x16__neonfp16arith_ld64);
284
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
285
+
286
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
287
+ static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
288
+ GEMMEnd2EndBenchmark(state, model,
289
+ xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
290
+ xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
291
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
292
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
293
+ xnn_init_f16_minmax_avx_params,
294
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
295
+ benchmark::utils::CheckAVX2);
296
+ }
297
+ static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
298
+ GEMMEnd2EndBenchmark(state, model,
299
+ xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
300
+ xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
301
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
302
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
303
+ xnn_init_f16_minmax_avx_params,
304
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
305
+ benchmark::utils::CheckAVX2);
306
+ }
307
+ static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
308
+ GEMMEnd2EndBenchmark(state, model,
309
+ xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
310
+ xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
311
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
312
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
313
+ xnn_init_f16_minmax_avx_params,
314
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
315
+ benchmark::utils::CheckAVX2);
316
+ }
317
+ static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
318
+ GEMMEnd2EndBenchmark(state, model,
319
+ xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
320
+ xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
321
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
322
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
323
+ xnn_init_f16_minmax_avx_params,
324
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
325
+ benchmark::utils::CheckAVX2);
326
+ }
327
+
328
+ static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
329
+ GEMMEnd2EndBenchmark(state, model,
330
+ xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
331
+ xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
332
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
333
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
334
+ xnn_init_f16_minmax_avx_params,
335
+ 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
336
+ benchmark::utils::CheckAVX2);
337
+ }
338
+ static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
339
+ GEMMEnd2EndBenchmark(state, model,
340
+ xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
341
+ xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
342
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
343
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
344
+ xnn_init_f16_minmax_avx_params,
345
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
346
+ benchmark::utils::CheckAVX2);
347
+ }
348
+ static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
349
+ GEMMEnd2EndBenchmark(state, model,
350
+ xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
351
+ xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
352
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
353
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
354
+ xnn_init_f16_minmax_avx_params,
355
+ 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
356
+ benchmark::utils::CheckAVX2);
357
+ }
358
+
359
+ static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
360
+ GEMMEnd2EndBenchmark(state, model,
361
+ xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast,
362
+ xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast,
363
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
364
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
365
+ xnn_init_f16_minmax_avx_params,
366
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
367
+ benchmark::utils::CheckAVX2);
368
+ }
369
+ static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
370
+ GEMMEnd2EndBenchmark(state, model,
371
+ xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast,
372
+ xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast,
373
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
374
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
375
+ xnn_init_f16_minmax_avx_params,
376
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
377
+ benchmark::utils::CheckAVX2);
378
+ }
379
+ static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
380
+ GEMMEnd2EndBenchmark(state, model,
381
+ xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast,
382
+ xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast,
383
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
384
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
385
+ xnn_init_f16_minmax_avx_params,
386
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
387
+ benchmark::utils::CheckAVX2);
388
+ }
389
+ static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
390
+ GEMMEnd2EndBenchmark(state, model,
391
+ xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast,
392
+ xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast,
393
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
394
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
395
+ xnn_init_f16_minmax_avx_params,
396
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
397
+ benchmark::utils::CheckAVX2);
398
+ }
399
+
400
+ static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
401
+ GEMMEnd2EndBenchmark(state, model,
402
+ xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast,
403
+ xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast,
404
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
405
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
406
+ xnn_init_f16_minmax_avx_params,
407
+ 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
408
+ benchmark::utils::CheckAVX2);
409
+ }
410
+ static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
411
+ GEMMEnd2EndBenchmark(state, model,
412
+ xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast,
413
+ xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast,
414
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
415
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
416
+ xnn_init_f16_minmax_avx_params,
417
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
418
+ benchmark::utils::CheckAVX2);
419
+ }
420
+ static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
421
+ GEMMEnd2EndBenchmark(state, model,
422
+ xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast,
423
+ xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast,
424
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
425
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
426
+ xnn_init_f16_minmax_avx_params,
427
+ 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
428
+ benchmark::utils::CheckAVX2);
429
+ }
430
+
431
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__avx2_broadcast);
432
+ BENCHMARK_FP16_END2END(f16_gemm_5x8__avx2_broadcast);
433
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__avx2_broadcast);
434
+ BENCHMARK_FP16_END2END(f16_gemm_7x8__avx2_broadcast);
435
+
436
+ BENCHMARK_FP16_END2END(f16_gemm_3x16__avx2_broadcast);
437
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__avx2_broadcast);
438
+ BENCHMARK_FP16_END2END(f16_gemm_5x16__avx2_broadcast);
439
+
440
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x8__avx2_broadcast);
441
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x8__avx2_broadcast);
442
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_6x8__avx2_broadcast);
443
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_7x8__avx2_broadcast);
444
+
445
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_3x16__avx2_broadcast);
446
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x16__avx2_broadcast);
447
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x16__avx2_broadcast);
448
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
449
+
450
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
451
+ BENCHMARK_MAIN();
452
+ #endif
bench/f16-gemm.cc ADDED
@@ -0,0 +1,513 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <random>
14
+ #include <vector>
15
+
16
+ #include <benchmark/benchmark.h>
17
+ #include <fp16/fp16.h>
18
+ #include "bench/gemm.h"
19
+ #include "bench/utils.h"
20
+
21
+ #include <xnnpack.h>
22
+ #include <xnnpack/aligned-allocator.h>
23
+ #include <xnnpack/common.h>
24
+ #include <xnnpack/gemm.h>
25
+ #include <xnnpack/math.h>
26
+ #include <xnnpack/pack.h>
27
+ #include <xnnpack/microfnptr.h>
28
+ #include <xnnpack/microparams-init.h>
29
+
30
+
31
+ static void f16_gemm(benchmark::State& state,
32
+ xnn_f16_gemm_minmax_ukernel_fn gemm,
33
+ xnn_init_f16_minmax_params_fn init_params,
34
+ size_t mr, size_t nr, size_t kr, size_t sr,
35
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
36
+ {
37
+ if (isa_check != nullptr && !isa_check(state)) {
38
+ return;
39
+ }
40
+
41
+ const size_t mc = state.range(0);
42
+ const size_t nc = state.range(1);
43
+ const size_t kc = state.range(2);
44
+
45
+ const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
46
+ const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
47
+
48
+ std::random_device random_device;
49
+ auto rng = std::mt19937(random_device());
50
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
51
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
52
+
53
+ std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
54
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
55
+ std::vector<uint16_t> k(nc * kc);
56
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
57
+ std::vector<uint16_t> b(nc);
58
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
59
+
60
+ const size_t w_elements = nc_stride * kc_stride + nc_stride;
61
+ const size_t c_elements = mc * nc;
62
+ const size_t num_buffers = 1 +
63
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
64
+ sizeof(uint16_t) * (w_elements + c_elements));
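+ // num_buffers is sized so that the combined footprint of all W and C copies
+ // exceeds the largest cache: rotating through the copies below guarantees
+ // that each iteration sees cold packed weights and a cold output buffer.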
65
+
66
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
67
+ std::fill(w.begin(), w.end(), 0);
68
+ xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
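+ // "goi" refers to the source weight layout [groups, output channels, input
+ // channels]. The packed buffer stores one bias value plus kc_stride weights
+ // per output channel, in nr-wide panels, which is why the indexing in the
+ // loop below strides by (kc_stride + 1) elements per output channel.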
69
+ std::vector<uint16_t> c(c_elements * num_buffers);
70
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
71
+
72
+ // Prepare minmax parameters.
73
+ xnn_f16_minmax_params params;
74
+ init_params(&params,
75
+ UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
76
+
77
+ size_t buffer_index = 0;
78
+ for (auto _ : state) {
79
+ // Use circular buffers (exceeding cache size) and prefetch to control cache state:
80
+ // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
81
+ // - W is not in cache (for any cache level)
82
+ // - C is not in cache (for any cache level)
83
+ state.PauseTiming();
84
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
85
+ buffer_index = (buffer_index + 1) % num_buffers;
86
+ state.ResumeTiming();
87
+
88
+ for (uint32_t m = 0; m < mc; m += mr) {
89
+ const uint32_t mb = min(mc - m, mr);
90
+ for (uint32_t n = 0; n < nc; n += nr) {
91
+ const uint32_t nb = min(nc - n, nr);
92
+ gemm(
93
+ mb, nb, kc * sizeof(uint16_t),
94
+ a.data() + m * kc, kc * sizeof(uint16_t),
95
+ w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
96
+ c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
97
+ &params);
98
+ }
99
+ }
100
+ }
101
+
102
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
103
+ if (cpu_frequency != 0) {
104
+ state.counters["cpufreq"] = cpu_frequency;
105
+ }
106
+
107
+ state.counters["FLOPS"] = benchmark::Counter(
108
+ uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
109
+ }
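+ // The FLOPS counter charges 2 ops (one multiply, one add) per MAC, so a
+ // single GEMM of size mc x nc x kc costs 2*mc*nc*kc FLOPs; e.g. for
+ // mc = nc = kc = 256 that is 2 * 256^3 = 33,554,432 FLOPs per iteration.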
110
+
111
+
112
+ #if XNN_PLATFORM_JIT
113
+ static void f16_gemm(benchmark::State& state,
114
+ xnn_jit_gemm_code_generator_fn generator,
115
+ xnn_init_f16_minmax_params_fn init_params,
116
+ size_t mr, size_t nr, size_t kr, size_t sr,
117
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
118
+ {
119
+ if (isa_check != nullptr && !isa_check(state)) {
120
+ return;
121
+ }
122
+
123
+ const size_t mc = state.range(0);
124
+ const size_t nc = state.range(1);
125
+ const size_t kc = state.range(2);
126
+
127
+ const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
128
+ const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
129
+
130
+ std::random_device random_device;
131
+ auto rng = std::mt19937(random_device());
132
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
133
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
134
+
135
+ std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
136
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
137
+ std::vector<uint16_t> k(nc * kc);
138
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
139
+ std::vector<uint16_t> b(nc);
140
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
141
+
142
+ const size_t w_elements = nc_stride * kc_stride + nc_stride;
143
+ const size_t c_elements = mc * nc;
144
+ const size_t num_buffers = 1 +
145
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
146
+ sizeof(uint16_t) * (w_elements + c_elements));
147
+
148
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
149
+ std::fill(w.begin(), w.end(), 0);
150
+ xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
151
+ std::vector<uint16_t> c(c_elements * num_buffers);
152
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
153
+
154
+ // Prepare minmax parameters.
155
+ xnn_f16_minmax_params params;
156
+ init_params(&params,
157
+ UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
158
+
159
+ jit_gemm_params jit_params = {};
160
+ jit_params.f16_minmax.min = UINT16_C(0xFC00); /* -inf */
161
+ jit_params.f16_minmax.max = UINT16_C(0x7C00); /* inf */
162
+
163
+ xnn_code_buffer code_buffer;
164
+ xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
165
+ generator(&code_buffer, mr, nc % nr, kc * sizeof(uint16_t), &jit_params);
166
+ xnn_finalize_code_memory(&code_buffer);
167
+ xnn_f16_gemm_minmax_ukernel_fn gemm = reinterpret_cast<xnn_f16_gemm_minmax_ukernel_fn>(code_buffer.start);
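+ // At this point the generator has emitted machine code for an mr x nr
+ // microkernel into code_buffer (specialized for the nc % nr remainder
+ // columns passed above), xnn_finalize_code_memory() has made the pages
+ // executable, and the cast turns the buffer into a directly callable
+ // microkernel with the same signature as the AOT-compiled ones. Error
+ // returns from the allocate/generate/finalize calls are ignored here for
+ // brevity.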
168
+
169
+ size_t buffer_index = 0;
170
+ for (auto _ : state) {
171
+ // Use circular buffers (exceeding cache size) and prefetch to control cache state:
172
+ // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
173
+ // - W is not in cache (for any cache level)
174
+ // - C is not in cache (for any cache level)
175
+ state.PauseTiming();
176
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
177
+ buffer_index = (buffer_index + 1) % num_buffers;
178
+ state.ResumeTiming();
179
+
180
+ for (uint32_t m = 0; m < mc; m += mr) {
181
+ const uint32_t mb = min(mc - m, mr);
182
+ for (uint32_t n = 0; n < nc; n += nr) {
183
+ const uint32_t nb = min(nc - n, nr);
184
+ gemm(
185
+ mb, nb, kc * sizeof(uint16_t),
186
+ a.data() + m * kc, kc * sizeof(uint16_t),
187
+ w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
188
+ c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
189
+ &params);
190
+ }
191
+ }
192
+ }
193
+
194
+ xnn_release_code_memory(&code_buffer);
195
+
196
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
197
+ if (cpu_frequency != 0) {
198
+ state.counters["cpufreq"] = cpu_frequency;
199
+ }
200
+
201
+ state.counters["FLOPS"] = benchmark::Counter(
202
+ uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
203
+ }
204
+ #endif // XNN_PLATFORM_JIT
205
+
206
+
207
+ #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
208
+ static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
209
+ f16_gemm(state,
210
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
211
+ xnn_init_f16_minmax_fp16arith_params,
212
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
213
+ benchmark::utils::CheckNEONFP16ARITH);
214
+ }
215
+ static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
216
+ f16_gemm(state,
217
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
218
+ xnn_init_f16_minmax_fp16arith_params,
219
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
220
+ benchmark::utils::CheckNEONFP16ARITH);
221
+ }
222
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
223
+ f16_gemm(state,
224
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
225
+ xnn_init_f16_minmax_fp16arith_params,
226
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
227
+ benchmark::utils::CheckNEONFP16ARITH);
228
+ }
229
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
230
+ f16_gemm(state,
231
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
232
+ xnn_init_f16_minmax_fp16arith_params,
233
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
234
+ benchmark::utils::CheckNEONFP16ARITH);
235
+ }
236
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
237
+ f16_gemm(state,
238
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
239
+ xnn_init_f16_minmax_fp16arith_params,
240
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
241
+ benchmark::utils::CheckNEONFP16ARITH);
242
+ }
243
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
244
+ f16_gemm(state,
245
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
246
+ xnn_init_f16_minmax_fp16arith_params,
247
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
248
+ benchmark::utils::CheckNEONFP16ARITH);
249
+ }
250
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
251
+ f16_gemm(state,
252
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
253
+ xnn_init_f16_minmax_fp16arith_params,
254
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
255
+ benchmark::utils::CheckNEONFP16ARITH);
256
+ }
257
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
258
+ f16_gemm(state,
259
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
260
+ xnn_init_f16_minmax_fp16arith_params,
261
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
262
+ benchmark::utils::CheckNEONFP16ARITH);
263
+ }
264
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
265
+ f16_gemm(state,
266
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
267
+ xnn_init_f16_minmax_fp16arith_params,
268
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
269
+ benchmark::utils::CheckNEONFP16ARITH);
270
+ }
271
+ static void f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
272
+ f16_gemm(state,
273
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
274
+ xnn_init_f16_minmax_fp16arith_params,
275
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
276
+ benchmark::utils::CheckNEONFP16ARITH);
277
+ }
278
+ static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
279
+ f16_gemm(state,
280
+ xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
281
+ xnn_init_f16_minmax_fp16arith_params,
282
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
283
+ benchmark::utils::CheckNEONFP16ARITH);
284
+ }
285
+ static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
286
+ f16_gemm(state,
287
+ xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
288
+ xnn_init_f16_minmax_fp16arith_params,
289
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
290
+ benchmark::utils::CheckNEONFP16ARITH);
291
+ }
292
+ static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
293
+ f16_gemm(state,
294
+ xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
295
+ xnn_init_f16_minmax_fp16arith_params,
296
+ /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
297
+ benchmark::utils::CheckNEONFP16ARITH);
298
+ }
299
+
300
+ BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32)
301
+ BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64)
302
+ BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32)
303
+ BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64)
304
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
305
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
306
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
307
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32)
308
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64)
309
+ BENCHMARK_GEMM(f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64)
310
+ BENCHMARK_GEMM(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64)
311
+ BENCHMARK_GEMM(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64)
312
+ BENCHMARK_GEMM(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64)
313
+ #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
314
+
315
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
316
+ static void f16_gemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
317
+ f16_gemm(state,
318
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
319
+ xnn_init_f16_minmax_fp16arith_params,
320
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
321
+ benchmark::utils::CheckNEONFP16ARITH);
322
+ }
323
+ static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
324
+ f16_gemm(state,
325
+ xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
326
+ xnn_init_f16_minmax_fp16arith_params,
327
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
328
+ benchmark::utils::CheckNEONFP16ARITH);
329
+ }
330
+ static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
331
+ f16_gemm(state,
332
+ xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
333
+ xnn_init_f16_minmax_fp16arith_params,
334
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
335
+ benchmark::utils::CheckNEONFP16ARITH);
336
+ }
337
+ static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
338
+ f16_gemm(state,
339
+ xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
340
+ xnn_init_f16_minmax_fp16arith_params,
341
+ /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
342
+ benchmark::utils::CheckNEONFP16ARITH);
343
+ }
344
+ static void f16_gemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
345
+ f16_gemm(state,
346
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
347
+ xnn_init_f16_minmax_fp16arith_params,
348
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
349
+ benchmark::utils::CheckNEONFP16ARITH);
350
+ }
351
+ static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
352
+ f16_gemm(state,
353
+ xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
354
+ xnn_init_f16_minmax_fp16arith_params,
355
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
356
+ benchmark::utils::CheckNEONFP16ARITH);
357
+ }
358
+ static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
359
+ f16_gemm(state,
360
+ xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
361
+ xnn_init_f16_minmax_fp16arith_params,
362
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
363
+ benchmark::utils::CheckNEONFP16ARITH);
364
+ }
365
+ static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
366
+ f16_gemm(state,
367
+ xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
368
+ xnn_init_f16_minmax_fp16arith_params,
369
+ /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
370
+ benchmark::utils::CheckNEONFP16ARITH);
371
+ }
372
+
373
+ BENCHMARK_GEMM(f16_gemm_1x8__neonfp16arith_ld64)
374
+ BENCHMARK_GEMM(f16_gemm_4x8__neonfp16arith_ld64)
375
+ BENCHMARK_GEMM(f16_gemm_6x8__neonfp16arith_ld64)
376
+ BENCHMARK_GEMM(f16_gemm_8x8__neonfp16arith_ld64)
377
+ BENCHMARK_GEMM(f16_gemm_1x16__neonfp16arith_ld64)
378
+ BENCHMARK_GEMM(f16_gemm_4x16__neonfp16arith_ld64)
379
+ BENCHMARK_GEMM(f16_gemm_6x16__neonfp16arith_ld64)
380
+ BENCHMARK_GEMM(f16_gemm_8x16__neonfp16arith_ld64)
381
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
382
+
383
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
384
+ static void f16_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
385
+ f16_gemm(state,
386
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
387
+ xnn_init_f16_minmax_avx_params,
388
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
389
+ benchmark::utils::CheckAVX2);
390
+ }
391
+ static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
392
+ f16_gemm(state,
393
+ xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
394
+ xnn_init_f16_minmax_avx_params,
395
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
396
+ benchmark::utils::CheckAVX2);
397
+ }
398
+ static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
399
+ f16_gemm(state,
400
+ xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
401
+ xnn_init_f16_minmax_avx_params,
402
+ /*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
403
+ benchmark::utils::CheckAVX2);
404
+ }
405
+ static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
406
+ f16_gemm(state,
407
+ xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
408
+ xnn_init_f16_minmax_avx_params,
409
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
410
+ benchmark::utils::CheckAVX2);
411
+ }
412
+ static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
413
+ f16_gemm(state,
414
+ xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
415
+ xnn_init_f16_minmax_avx_params,
416
+ /*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
417
+ benchmark::utils::CheckAVX2);
418
+ }
419
+ static void f16_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
420
+ f16_gemm(state,
421
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
422
+ xnn_init_f16_minmax_avx_params,
423
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
424
+ benchmark::utils::CheckAVX2);
425
+ }
426
+ static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
427
+ f16_gemm(state,
428
+ xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
429
+ xnn_init_f16_minmax_avx_params,
430
+ /*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
431
+ benchmark::utils::CheckAVX2);
432
+ }
433
+ static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
434
+ f16_gemm(state,
435
+ xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
436
+ xnn_init_f16_minmax_avx_params,
437
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
438
+ benchmark::utils::CheckAVX2);
439
+ }
440
+ static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
441
+ f16_gemm(state,
442
+ xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
443
+ xnn_init_f16_minmax_avx_params,
444
+ /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
445
+ benchmark::utils::CheckAVX2);
446
+ }
447
+
448
+ BENCHMARK_GEMM(f16_gemm_1x8__avx2_broadcast)
449
+ BENCHMARK_GEMM(f16_gemm_4x8__avx2_broadcast)
450
+ BENCHMARK_GEMM(f16_gemm_5x8__avx2_broadcast)
451
+ BENCHMARK_GEMM(f16_gemm_6x8__avx2_broadcast)
452
+ BENCHMARK_GEMM(f16_gemm_7x8__avx2_broadcast)
453
+ BENCHMARK_GEMM(f16_gemm_1x16__avx2_broadcast)
454
+ BENCHMARK_GEMM(f16_gemm_3x16__avx2_broadcast)
455
+ BENCHMARK_GEMM(f16_gemm_4x16__avx2_broadcast)
456
+ BENCHMARK_GEMM(f16_gemm_5x16__avx2_broadcast)
457
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
458
+
459
+ #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
460
+ static void f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
461
+ f16_gemm(state,
462
+ xnn_generate_f16_gemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
463
+ xnn_init_f16_minmax_fp16arith_params,
464
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
465
+ benchmark::utils::CheckNEONFP16ARITH);
466
+ }
467
+ static void f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
468
+ f16_gemm(state,
469
+ xnn_generate_f16_gemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
470
+ xnn_init_f16_minmax_fp16arith_params,
471
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
472
+ benchmark::utils::CheckNEONFP16ARITH);
473
+ }
474
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
475
+ f16_gemm(state,
476
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
477
+ xnn_init_f16_minmax_fp16arith_params,
478
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
479
+ benchmark::utils::CheckNEONFP16ARITH);
480
+ }
481
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
482
+ f16_gemm(state,
483
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
484
+ xnn_init_f16_minmax_fp16arith_params,
485
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
486
+ benchmark::utils::CheckNEONFP16ARITH);
487
+ }
488
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
489
+ f16_gemm(state,
490
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
491
+ xnn_init_f16_minmax_fp16arith_params,
492
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
493
+ benchmark::utils::CheckNEONFP16ARITH);
494
+ }
495
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
496
+ f16_gemm(state,
497
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
498
+ xnn_init_f16_minmax_fp16arith_params,
499
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
500
+ benchmark::utils::CheckNEONFP16ARITH);
501
+ }
502
+
503
+ BENCHMARK_GEMM(f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
504
+ BENCHMARK_GEMM(f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
505
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
506
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
507
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
508
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
509
+ #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
510
+
511
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
512
+ BENCHMARK_MAIN();
513
+ #endif
bench/f16-igemm.cc ADDED
@@ -0,0 +1,588 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/conv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/igemm.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microparams-init.h>
25
+ #include <xnnpack/operator.h>
26
+ #include <xnnpack/pack.h>
27
+
28
+
29
+ static void f16_igemm(benchmark::State& state,
30
+ xnn_f16_igemm_minmax_ukernel_fn igemm,
31
+ xnn_init_f16_minmax_params_fn init_params,
32
+ uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
33
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
34
+ {
35
+ if (isa_check != nullptr && !isa_check(state)) {
36
+ return;
37
+ }
38
+
39
+ const size_t input_height = state.range(0);
40
+ const size_t input_width = state.range(1);
41
+ const size_t kernel_height = state.range(2);
42
+ const size_t kernel_width = state.range(3);
43
+ const size_t kernel_size = kernel_height * kernel_width;
44
+ const size_t padding_height = state.range(4);
45
+ const size_t padding_width = state.range(5);
46
+ const size_t subsampling = state.range(6);
47
+ const size_t dilation = state.range(7);
48
+ const size_t group_input_channels = state.range(8);
49
+ const size_t group_output_channels = state.range(9);
50
+
51
+ std::random_device random_device;
52
+ auto rng = std::mt19937(random_device());
53
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
54
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
55
+
56
+ const size_t output_pixel_stride = group_output_channels;
57
+ const size_t input_pixel_stride = group_input_channels;
58
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60
+ const size_t padding_left = padding_width / 2;
61
+ const size_t padding_top = padding_height / 2;
62
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64
+ const size_t output_size = output_height * output_width;
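+ // Standard dilated-convolution arithmetic. A worked example, assuming a
+ // 56x56 input, 3x3 kernel, dilation 2, stride 1, and padding_height =
+ // padding_width = 4: the effective kernel is (3-1)*2+1 = 5, so the output
+ // is (56 + 4 - 5)/1 + 1 = 56 pixels per side.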
65
+
66
+ const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
67
+ const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
68
+ const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
69
+
70
+ std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
71
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
72
+ std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
73
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
74
+ std::vector<uint16_t> b(group_output_channels);
75
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
76
+
77
+ std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
78
+
79
+ const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
80
+ const size_t i_elements = mc_stride * kernel_size;
81
+ const size_t c_elements = output_height * output_width * output_pixel_stride;
82
+ const size_t num_buffers = 1 +
83
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
84
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
85
+
86
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
87
+ std::fill(w.begin(), w.end(), 0);
88
+ xnn_pack_f16_conv_goki_w(
89
+ 1 /* groups */, group_output_channels, kernel_size, group_input_channels,
90
+ nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
91
+ for (size_t n = 1; n < num_buffers; n++) {
92
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
93
+ }
94
+
95
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
96
+ xnn_operator convolution_op = { };
97
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
98
+ convolution_op.input = a.data();
99
+ convolution_op.input_pixel_stride = input_pixel_stride;
100
+ convolution_op.zero_buffer = z.data();
101
+ convolution_op.groups = 1;
102
+ convolution_op.group_input_channels = group_input_channels;
103
+ convolution_op.batch_size = 1;
104
+ convolution_op.input_height = input_height;
105
+ convolution_op.input_width = input_width;
106
+ convolution_op.output_height = output_height;
107
+ convolution_op.output_width = output_width;
108
+ convolution_op.kernel_height = kernel_height;
109
+ convolution_op.kernel_width = kernel_width;
110
+ convolution_op.stride_height = subsampling;
111
+ convolution_op.stride_width = subsampling;
112
+ convolution_op.dilation_height = dilation;
113
+ convolution_op.dilation_width = dilation;
114
+ convolution_op.padding_top = padding_top;
115
+ convolution_op.padding_left = padding_left;
116
+ xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
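+ // xnn_indirection_init_conv2d() fills the indirection buffer with one input
+ // pointer per (output pixel, kernel tap) pair; taps that fall into the
+ // padding point at zero_buffer instead. The IGEMM kernel then gathers its
+ // rows through these pointers, avoiding an explicit im2col copy of the input.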
117
+ for (size_t n = 1; n < num_buffers; n++) {
118
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
119
+ }
120
+
121
+ std::vector<uint16_t> c(c_elements * num_buffers);
122
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
123
+
124
+ // Prepare minmax parameters.
125
+ xnn_f16_minmax_params params;
126
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
127
+
128
+ size_t buffer_index = 0;
129
+ for (auto _ : state) {
130
+ state.PauseTiming();
131
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
132
+ buffer_index = (buffer_index + 1) % num_buffers;
133
+ state.ResumeTiming();
134
+
135
+ for (uint32_t m = 0; m < output_size; m += mr) {
136
+ const uint32_t mb = min(output_size - m, mr);
137
+ for (uint32_t n = 0; n < group_output_channels; n += nr) {
138
+ const uint32_t nb = min(group_output_channels - n, nr);
139
+ igemm(
140
+ mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
141
+ reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
142
+ w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
143
+ c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
144
+ 0, z.data(), &params);
145
+ }
146
+ }
147
+ }
148
+
149
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
150
+ if (cpu_frequency != 0) {
151
+ state.counters["cpufreq"] = cpu_frequency;
152
+ }
153
+
154
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_PLATFORM_JIT
+ static void f16_igemm(benchmark::State& state,
+   xnn_jit_igemm_code_generator_fn generator,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t kernel_height = state.range(2);
+   const size_t kernel_width = state.range(3);
+   const size_t kernel_size = kernel_height * kernel_width;
+   const size_t padding_height = state.range(4);
+   const size_t padding_width = state.range(5);
+   const size_t subsampling = state.range(6);
+   const size_t dilation = state.range(7);
+   const size_t group_input_channels = state.range(8);
+   const size_t group_output_channels = state.range(9);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t output_pixel_stride = group_output_channels;
+   const size_t input_pixel_stride = group_input_channels;
+   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+   const size_t padding_left = padding_width / 2;
+   const size_t padding_top = padding_height / 2;
+   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+   const size_t output_size = output_height * output_width;
+
+   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
+   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
+
+   std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(group_output_channels);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
+   const size_t i_elements = mc_stride * kernel_size;
+   const size_t c_elements = output_height * output_width * output_pixel_stride;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_conv_goki_w(
+     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
+     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+   }
+
+   std::vector<const uint16_t*> i(i_elements * num_buffers);
+   xnn_operator convolution_op = { };
+   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+   convolution_op.input = a.data();
+   convolution_op.input_pixel_stride = input_pixel_stride;
+   convolution_op.zero_buffer = z.data();
+   convolution_op.groups = 1;
+   convolution_op.group_input_channels = group_input_channels;
+   convolution_op.batch_size = 1;
+   convolution_op.input_height = input_height;
+   convolution_op.input_width = input_width;
+   convolution_op.output_height = output_height;
+   convolution_op.output_width = output_width;
+   convolution_op.kernel_height = kernel_height;
+   convolution_op.kernel_width = kernel_width;
+   convolution_op.stride_height = subsampling;
+   convolution_op.stride_width = subsampling;
+   convolution_op.dilation_height = dilation;
+   convolution_op.dilation_width = dilation;
+   convolution_op.padding_top = padding_top;
+   convolution_op.padding_left = padding_left;
+   xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+   }
+
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   jit_gemm_params jit_params = {};
+   jit_params.f16_minmax.min = UINT16_C(0xFC00);  /* -inf */
+   jit_params.f16_minmax.max = UINT16_C(0x7C00);  /* inf */
+
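+   // JIT path: the generator emits a microkernel specialized for this tile into an
+   // executable code buffer; group_output_channels % nr selects the remainder-N
+   // variant, and the finalized buffer is invoked through a function pointer.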
+   xnn_code_buffer code_buffer;
+   xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
+   generator(&code_buffer,
+     mr,
+     group_output_channels % nr,
+     group_input_channels * sizeof(uint16_t),
+     kernel_size * mr * sizeof(void*),
+     &jit_params);
+   xnn_finalize_code_memory(&code_buffer);
+   auto igemm = reinterpret_cast<xnn_f16_igemm_minmax_ukernel_fn>(code_buffer.start);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < output_size; m += mr) {
+       const uint32_t mb = min(output_size - m, mr);
+       for (uint32_t n = 0; n < group_output_channels; n += nr) {
+         const uint32_t nb = min(group_output_channels - n, nr);
+         igemm(
+           mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
+           reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
+           w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
+           c.data() + buffer_index * c_elements + m * group_output_channels + n,
+           group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
+           0, z.data(), &params);
+       }
+     }
+   }
+   xnn_release_code_memory(&code_buffer);
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+ #endif  // XNN_PLATFORM_JIT
+
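+ // Each wrapper below binds one microkernel to its tile size (mr x nr) and ISA
+ // check; BENCHMARK_CONV then registers it over the standard convolution shapes.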
+ #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32)
+ BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32)
+ BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64)
+ #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void f16_igemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_1x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_6x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_8x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_6x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_8x16__neonfp16arith_ld64)
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_CONV(f16_igemm_1x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_4x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_5x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_6x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_7x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_1x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_3x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_4x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_5x16__avx2_broadcast)
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
+ #endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-raddstoreexpminusmax.cc ADDED
@@ -0,0 +1,387 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/raddstoreexpminusmax.h>
+ #include <xnnpack/rmax.h>
+
+
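+ // Benchmarks the softmax inner pass: rmax finds the row maximum, then the
+ // raddstoreexpminusmax kernel computes y[i] = exp(x[i] - max), stores the
+ // results, and accumulates their sum (the max is subtracted for numerical
+ // stability).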
+ static void f16_raddstoreexpminusmax(
+   benchmark::State& state,
+   xnn_f16_rmax_ukernel_fn rmax,
+   xnn_f16_raddstoreexpminusmax_ukernel_fn raddstoreexpminusmax,
+   xnn_init_f16_expminus_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
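+   // Round the output stride up to a whole number of 128-byte cache lines so each
+   // rotated output buffer starts on its own line.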
+   const size_t cache_line_size_max = 128;
+   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(uint16_t));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-100.0f, 100.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(uint16_t));
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(packed_elements * num_buffers);
+
+   std::generate(x.begin(), x.end(), std::ref(f16rng));
+
+   benchmark::utils::DisableDenormals();
+
+   xnn_f16_expminus_params params;
+   init_params(&params);
+
+   size_t buffer_index = 0;
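+   // The max reduction runs with timing paused, so only the exp-and-accumulate
+   // kernel itself is measured.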
+   for (auto _ : state) {
+     state.PauseTiming();
+     uint16_t x_max = UINT16_C(0x7E00) /* NaN */;
+     rmax(elements * sizeof(uint16_t), x.data(), &x_max);
+     if (++buffer_index == num_buffers) {
+       buffer_index = 0;
+     }
+     state.ResumeTiming();
+
+     uint16_t y_sum = UINT16_C(0x7E00) /* NaN */;
+     raddstoreexpminusmax(elements * sizeof(uint16_t), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
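+ // Variants differ only in unroll width (x32 ... x96 elements per loop iteration)
+ // and in the number of partial accumulators (accN) used for the running sum.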
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc4,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc4,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc5,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc5,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc4,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc4,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc5,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc5,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc6,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc4,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc4,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc5,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc5,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc4,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc4,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc5,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc5,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc6,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-rsum.cc ADDED
@@ -0,0 +1,101 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/reduce.h>
+
+
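+ // Benchmarks the scaled-sum reduction: the rsum kernel adds all fp16 inputs and
+ // writes scale * sum to a single fp16 output (scale = 0.1 here).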
+ static void f16_rsum(
+   benchmark::State& state,
+   xnn_f16_rsum_ukernel_fn rsum,
+   xnn_init_f16_scale_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+
+   xnn_f16_scale_params params;
+   init_params(&params, /*scale=*/fp16_ieee_from_fp32_value(0.1f));
+
+   uint16_t output = UINT16_C(0x7E00);  /* NaN */
+   for (auto _ : state) {
+     rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x8,
+   xnn_f16_rsum_ukernel__neonfp16arith_x8,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x16_acc2,
+   xnn_f16_rsum_ukernel__neonfp16arith_x16_acc2,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x24_acc3,
+   xnn_f16_rsum_ukernel__neonfp16arith_x24_acc3,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc2,
+   xnn_f16_rsum_ukernel__neonfp16arith_x32_acc2,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc4,
+   xnn_f16_rsum_ukernel__neonfp16arith_x32_acc4,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-spmm.cc ADDED
@@ -0,0 +1,247 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdlib>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/spmm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/spmm.h>
+
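+ // Doubling the raw bits discards the sign bit via 16-bit overflow, so both
+ // +0.0 (0x0000) and -0.0 (0x8000) are treated as zero.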
+ static inline bool is_fp16_zero(uint16_t x) {
+   const uint16_t two_x = x + x;
+   return two_x == 0;
+ }
+
+ static void f16_spmm(benchmark::State& state,
+   xnn_f16_spmm_minmax_ukernel_fn spmm, uint32_t mr, uint32_t nr, float sparsity,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   std::uniform_real_distribution<float> f32dist;
+   std::uniform_real_distribution<float> pdist;
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(kc * mc);
+   // Think of b as (n/nr + n % nr) x k; expansion happens later.
+   const size_t ncols = nc / nr + nc % nr;
+   std::vector<uint16_t> b(ncols * kc);
+   std::vector<uint16_t> bias(nc);
+   // Number of non-zero weights per N (output channel).
+   std::vector<uint32_t> nmap(nc);
+   // Mapping from index of non-zero weight to increment of K (input channel) following this index.
+   std::vector<int32_t> dmap(nc * kc);
+   std::vector<uint16_t> w(nc * kc + nc);
+   std::vector<uint16_t> output(nc * mc);
+
+   std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::generate(b.begin(), b.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+   std::fill(nmap.begin(), nmap.end(), 0);
+   std::fill(dmap.begin(), dmap.end(), 0);
+   std::fill(w.begin(), w.end(), 0);
+
+   for (uint16_t& b_value : b) {
+     if (pdist(rng) <= sparsity) {
+       b_value = 0;
+     }
+   }
+
+   uint32_t nnz = 0;
+   uint32_t wcnt = 0;
+   size_t last_kk = 0;
+   bool first_nzz = true;
+   size_t first_kk = 0;
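+   // Pack w in XNNPACK's sparse layout: per block of nr output channels, nr bias
+   // values followed by an nr-wide group of weights for every non-zero; dmap
+   // records the byte increment of the input pointer between consecutive
+   // non-zeros, and nmap counts non-zeros per block.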
+   for (size_t nn = 0; nn < nc / nr; nn++) {
+     for (size_t i = 0; i < nr; ++i)
+       w[wcnt++] = bias[nr * nn + i];
+     for (size_t kk = 0; kk < kc; kk++) {
+       if (!is_fp16_zero(b[nn * kc + kk])) {
+         // Every non-zero actually corresponds to nr adjacent non-zeros.
+         for (size_t i = 0; i < nr; ++i)
+           w[wcnt++] = fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
+         // Skip the very first non-zero weight, as we record only the differences.
+         if (first_nzz) {
+           first_kk = kk;
+         } else {
+           const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+           dmap[nnz++] = increment;
+         }
+         last_kk = kk;
+         first_nzz = false;
+         nmap[nn] += 1;
+       }
+     }
+   }
+
+   // Now that the blocked part of the matrix is constructed, handle the leftover
+   // columns, which always use nr=1.
+   for (size_t nn = nc / nr; nn < ncols; nn++) {
+     w[wcnt++] = bias[(nc / nr) * nr + (nn - nc / nr)];
+     for (size_t kk = 0; kk < kc; kk++) {
+       if (!is_fp16_zero(b[nn * kc + kk])) {
+         // Every non-zero actually corresponds to nr adjacent non-zeros.
+         w[wcnt++] = b[nn * kc + kk];
+         // Skip the very first non-zero weight, as we record only the differences.
+         if (first_nzz) {
+           first_kk = kk;
+         } else {
+           const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+           dmap[nnz++] = increment;
+         }
+         last_kk = kk;
+         first_nzz = false;
+         nmap[nn] += 1;
+       }
+     }
+   }
+   // In the end, the input pointer must be returned to its initial value.
+   const int32_t increment = int32_t(first_kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+   dmap[nnz++] = increment;
+
+   // Generate the expanded b used in the reference calculation: wherever the
+   // original has a non-zero, copy it and add adjacent non-zeros with
+   // incremented weight values.
+   std::vector<uint16_t> b_full(nc * kc);
+   if (nr == 1) {
+     b_full = b;
+   }
+   else {
+     for (size_t nn = 0; nn < nc / nr; nn++) {
+       for (size_t kk = 0; kk < kc; kk++) {
+         if (b[nn * kc + kk] != 0.0f) {
+           for (size_t i = 0; i < nr; ++i)
+             b_full[nr * nn * kc + i * kc + kk] = fp16_ieee_from_fp32_value(
+               fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
+         }
+       }
+     }
+     for (size_t nn = nc / nr; nn < ncols; nn++) {
+       for (size_t kk = 0; kk < kc; kk++) {
+         if (b[nn * kc + kk] != 0.0f) {
+           b_full[nr * (nc / nr) * kc + (nn - nc / nr) * kc + kk] = b[nn * kc + kk];
+         }
+       }
+     }
+   }
+
+   // The micro-kernel can access one element beyond w and dmap for software pipelining.
+   w.resize(wcnt + 1);
+   dmap.resize(nnz + 1);
+
+   // Prepare parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params, 0xFC00 /* -inf */, 0x7C00 /* inf */);
+   for (auto _ : state) {
+     spmm(mc * sizeof(uint16_t), nc,
+       input.data() + first_kk * mc,
+       w.data(), dmap.data(), nmap.data(),
+       output.data(), mc * sizeof(uint16_t),
+       &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
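+   // FLOPS counts only the MACs actually performed on non-zero weights;
+   // EffFLOPS counts the work an equivalent dense matrix multiplication would do.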
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nnz, benchmark::Counter::kIsRate);
+
+   state.counters["EffFLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_8x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_pipelined, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_8x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_x2, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_pipelined, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_x2, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_pipelined, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_x2, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_pipelined, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_x2, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_x2)
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-velu.cc ADDED
@@ -0,0 +1,104 @@
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <functional>
9
+ #include <random>
10
+ #include <vector>
11
+
12
+ #include <benchmark/benchmark.h>
13
+ #include <fp16/fp16.h>
14
+ #include "bench/utils.h"
15
+
16
+ #include <xnnpack.h>
17
+ #include <xnnpack/aligned-allocator.h>
18
+ #include <xnnpack/common.h>
19
+ #include <xnnpack/microfnptr.h>
20
+ #include <xnnpack/microparams-init.h>
21
+ #include <xnnpack/vunary.h>
22
+
23
+
24
+ static void f16_velu(
25
+ benchmark::State& state,
26
+ xnn_f16_velu_ukernel_fn elu,
27
+ xnn_init_f16_elu_params_fn init_params,
28
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
29
+ {
30
+ if (isa_check && !isa_check(state)) {
31
+ return;
32
+ }
33
+
34
+ const size_t num_elements = state.range(0);
35
+
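+ // Inputs: uniform FP32 values in [-9, 9], converted to IEEE FP16 bit patterns stored as uint16_t.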
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-9.0f, 9.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ union xnn_f16_elu_params params;
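+ // 0x3C00 is 1.0 in IEEE half precision, so prescale = alpha = beta = 1.0h below.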
+ init_params(&params,
+ UINT16_C(0x3C00) /* prescale = 1.0h */,
+ UINT16_C(0x3C00) /* alpha = 1.0h */,
+ UINT16_C(0x3C00) /* beta = 1.0h */);
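+ // Note: the ukernel's first argument is the batch size in bytes, not elements.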
+ for (auto _ : state) {
+ elu(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
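+ // Traffic per element: one uint16_t read plus one uint16_t write, hence the factor of 2 below.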
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x8,
+ xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x8,
+ xnn_init_f16_elu_fp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x16,
+ xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
+ xnn_init_f16_elu_fp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x8,
+ xnn_f16_velu_ukernel__avx2_rr1_p3_x8,
+ xnn_init_f16_elu_avx2_rr1_p3_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x16,
+ xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
+ xnn_init_f16_elu_avx2_rr1_p3_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vsigmoid.cc ADDED
@@ -0,0 +1,319 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vsigmoid(
+ benchmark::State& state,
+ xnn_f16_vsigmoid_ukernel_fn sigmoid,
+ xnn_init_f16_sigmoid_params_fn init_params,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
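+ // Outputs are pre-filled with FP16 NaN (0x7E00) so unwritten lanes are easy to spot.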
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ xnn_f16_sigmoid_params params;
+ init_params(&params);
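+ // As in the other f16 harnesses, the batch size passed to the ukernel below is in bytes.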
+ for (auto _ : state) {
+ sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x8,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x16,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x24,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x32,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x40,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x48,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x56,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x64,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x8,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x8,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x16,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x16,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x24,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x24,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x32,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x32,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x40,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x40,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x48,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x48,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x56,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x56,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x64,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x64,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x8,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x8,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x16,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x16,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x24,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x24,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x32,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x40,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x40,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x48,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x48,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x56,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x56,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x64,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x64,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vsqrt.cc ADDED
@@ -0,0 +1,121 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vsqrt(
+ benchmark::State& state,
+ xnn_f16_vsqrt_ukernel_fn sqrt,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
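+ // The default-constructed distribution draws from [0, 1), so every input is a valid sqrt argument.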
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
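+ // vsqrt needs no params struct in this harness, so the ukernel is invoked with nullptr.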
+ for (auto _ : state) {
+ sqrt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x8,
+ xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x8,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x16,
+ xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x16,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x8,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x8,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x16,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x16,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x24,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x24,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x32,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x32,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x1,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x1,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x2,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x2,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x4,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x4,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vtanh.cc ADDED
@@ -0,0 +1,807 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vtanh(
+ benchmark::State& state,
+ xnn_f16_vtanh_ukernel_fn tanh,
+ xnn_init_f16_tanh_params_fn init_params = nullptr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check != nullptr && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-5.0f, 5.0f), std::ref(rng));
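+ // Inputs in [-5, 5] exercise both the polynomial core of tanh and its near-saturated tails.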
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ xnn_f16_tanh_params params;
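+ // init_params may be nullptr: some variants (e.g. the AArch64 div kernels below) take no parameters.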
47
+ if (init_params != nullptr) {
48
+ init_params(&params);
49
+ }
50
+ for (auto _ : state) {
51
+ tanh(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
52
+ }
53
+
54
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
55
+ if (cpu_frequency != 0) {
56
+ state.counters["cpufreq"] = cpu_frequency;
57
+ }
58
+
59
+ const size_t elements_per_iteration = num_elements;
60
+ state.counters["elements"] =
61
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
62
+
63
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
64
+ state.counters["bytes"] =
65
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
66
+ }
67
+
68
+
69
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
70
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x8,
71
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x8,
72
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
73
+ benchmark::utils::CheckAVX2)
74
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
75
+ ->UseRealTime();
76
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x16,
77
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x16,
78
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
79
+ benchmark::utils::CheckAVX2)
80
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
81
+ ->UseRealTime();
82
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x24,
83
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x24,
84
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
85
+ benchmark::utils::CheckAVX2)
86
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
87
+ ->UseRealTime();
88
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x32,
89
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x32,
90
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
91
+ benchmark::utils::CheckAVX2)
92
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
93
+ ->UseRealTime();
94
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x40,
95
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x40,
96
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
97
+ benchmark::utils::CheckAVX2)
98
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
99
+ ->UseRealTime();
100
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x48,
101
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x48,
102
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
103
+ benchmark::utils::CheckAVX2)
104
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
105
+ ->UseRealTime();
106
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x56,
107
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x56,
108
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
109
+ benchmark::utils::CheckAVX2)
110
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
111
+ ->UseRealTime();
112
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x64,
113
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x64,
114
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
115
+ benchmark::utils::CheckAVX2)
116
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
117
+ ->UseRealTime();
118
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x72,
119
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x72,
120
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
121
+ benchmark::utils::CheckAVX2)
122
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
123
+ ->UseRealTime();
124
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x80,
125
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x80,
126
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
127
+ benchmark::utils::CheckAVX2)
128
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
129
+ ->UseRealTime();
130
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x8,
131
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x8,
132
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
133
+ benchmark::utils::CheckAVX2)
134
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
135
+ ->UseRealTime();
136
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x16,
137
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x16,
138
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
139
+ benchmark::utils::CheckAVX2)
140
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
141
+ ->UseRealTime();
142
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x24,
143
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x24,
144
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
145
+ benchmark::utils::CheckAVX2)
146
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
147
+ ->UseRealTime();
148
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x32,
149
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x32,
150
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
151
+ benchmark::utils::CheckAVX2)
152
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
153
+ ->UseRealTime();
154
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x40,
155
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x40,
156
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
157
+ benchmark::utils::CheckAVX2)
158
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
159
+ ->UseRealTime();
160
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x48,
161
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x48,
162
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
163
+ benchmark::utils::CheckAVX2)
164
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
165
+ ->UseRealTime();
166
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x56,
167
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x56,
168
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
169
+ benchmark::utils::CheckAVX2)
170
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
171
+ ->UseRealTime();
172
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x64,
173
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x64,
174
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
175
+ benchmark::utils::CheckAVX2)
176
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
177
+ ->UseRealTime();
178
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x72,
179
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x72,
180
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
181
+ benchmark::utils::CheckAVX2)
182
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
183
+ ->UseRealTime();
184
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x80,
185
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x80,
186
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
187
+ benchmark::utils::CheckAVX2)
188
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
189
+ ->UseRealTime();
190
+
191
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x8,
192
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x8,
193
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
194
+ benchmark::utils::CheckFMA3)
195
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
196
+ ->UseRealTime();
197
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x16,
198
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x16,
199
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
200
+ benchmark::utils::CheckFMA3)
201
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
202
+ ->UseRealTime();
203
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x24,
204
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x24,
205
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
206
+ benchmark::utils::CheckFMA3)
207
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
208
+ ->UseRealTime();
209
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x32,
210
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x32,
211
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
212
+ benchmark::utils::CheckFMA3)
213
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
214
+ ->UseRealTime();
215
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x40,
216
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x40,
217
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
218
+ benchmark::utils::CheckFMA3)
219
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
220
+ ->UseRealTime();
221
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x48,
222
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x48,
223
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
224
+ benchmark::utils::CheckFMA3)
225
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
226
+ ->UseRealTime();
227
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x56,
228
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x56,
229
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
230
+ benchmark::utils::CheckFMA3)
231
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
232
+ ->UseRealTime();
233
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x64,
234
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x64,
235
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
236
+ benchmark::utils::CheckFMA3)
237
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
238
+ ->UseRealTime();
239
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x72,
240
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x72,
241
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
242
+ benchmark::utils::CheckFMA3)
243
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
244
+ ->UseRealTime();
245
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x80,
246
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x80,
247
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
248
+ benchmark::utils::CheckFMA3)
249
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
250
+ ->UseRealTime();
251
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x8,
252
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x8,
253
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
254
+ benchmark::utils::CheckFMA3)
255
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
256
+ ->UseRealTime();
257
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x16,
258
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x16,
259
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
260
+ benchmark::utils::CheckFMA3)
261
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
262
+ ->UseRealTime();
263
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x24,
264
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x24,
265
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
266
+ benchmark::utils::CheckFMA3)
267
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
268
+ ->UseRealTime();
269
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x32,
270
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x32,
271
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
272
+ benchmark::utils::CheckFMA3)
273
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
274
+ ->UseRealTime();
275
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x40,
276
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x40,
277
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
278
+ benchmark::utils::CheckFMA3)
279
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
280
+ ->UseRealTime();
281
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x48,
282
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x48,
283
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
284
+ benchmark::utils::CheckFMA3)
285
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
286
+ ->UseRealTime();
287
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x56,
288
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x56,
289
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
290
+ benchmark::utils::CheckFMA3)
291
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
292
+ ->UseRealTime();
293
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x64,
294
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x64,
295
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
296
+ benchmark::utils::CheckFMA3)
297
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
298
+ ->UseRealTime();
299
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x72,
300
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x72,
301
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
302
+ benchmark::utils::CheckFMA3)
303
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
304
+ ->UseRealTime();
305
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x80,
306
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x80,
307
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
308
+ benchmark::utils::CheckFMA3)
309
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
310
+ ->UseRealTime();
311
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x8,
312
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x8,
313
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
314
+ benchmark::utils::CheckFMA3)
315
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
316
+ ->UseRealTime();
317
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x16,
318
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x16,
319
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
320
+ benchmark::utils::CheckFMA3)
321
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
322
+ ->UseRealTime();
323
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x24,
324
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x24,
325
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
326
+ benchmark::utils::CheckFMA3)
327
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
328
+ ->UseRealTime();
329
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x32,
330
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x32,
331
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
332
+ benchmark::utils::CheckFMA3)
333
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
334
+ ->UseRealTime();
335
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x40,
336
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x40,
337
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
338
+ benchmark::utils::CheckFMA3)
339
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
340
+ ->UseRealTime();
341
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x48,
342
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x48,
343
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
344
+ benchmark::utils::CheckFMA3)
345
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
346
+ ->UseRealTime();
347
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x56,
348
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x56,
349
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
350
+ benchmark::utils::CheckFMA3)
351
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
352
+ ->UseRealTime();
353
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x64,
354
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x64,
355
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
356
+ benchmark::utils::CheckFMA3)
357
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
358
+ ->UseRealTime();
359
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x72,
360
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x72,
361
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
362
+ benchmark::utils::CheckFMA3)
363
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
364
+ ->UseRealTime();
365
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x80,
366
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x80,
367
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
368
+ benchmark::utils::CheckFMA3)
369
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
370
+ ->UseRealTime();
371
+
372
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x8,
373
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x8,
374
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
375
+ benchmark::utils::CheckF16C)
376
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
377
+ ->UseRealTime();
378
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x16,
379
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x16,
380
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
381
+ benchmark::utils::CheckF16C)
382
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
383
+ ->UseRealTime();
384
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x24,
385
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x24,
386
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
387
+ benchmark::utils::CheckF16C)
388
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
389
+ ->UseRealTime();
390
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x32,
391
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x32,
392
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
393
+ benchmark::utils::CheckF16C)
394
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
395
+ ->UseRealTime();
396
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x40,
397
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x40,
398
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
399
+ benchmark::utils::CheckF16C)
400
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
401
+ ->UseRealTime();
402
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x48,
403
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x48,
404
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
405
+ benchmark::utils::CheckF16C)
406
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
407
+ ->UseRealTime();
408
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x56,
409
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x56,
410
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
411
+ benchmark::utils::CheckF16C)
412
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
413
+ ->UseRealTime();
414
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x64,
415
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x64,
416
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
417
+ benchmark::utils::CheckF16C)
418
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
419
+ ->UseRealTime();
420
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x72,
421
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x72,
422
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
423
+ benchmark::utils::CheckF16C)
424
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
425
+ ->UseRealTime();
426
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x80,
427
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x80,
428
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
429
+ benchmark::utils::CheckF16C)
430
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
431
+ ->UseRealTime();
432
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x8,
433
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x8,
434
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
435
+ benchmark::utils::CheckF16C)
436
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
437
+ ->UseRealTime();
438
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x16,
439
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x16,
440
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
441
+ benchmark::utils::CheckF16C)
442
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
443
+ ->UseRealTime();
444
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x24,
445
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x24,
446
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
447
+ benchmark::utils::CheckF16C)
448
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
449
+ ->UseRealTime();
450
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x32,
451
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x32,
452
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
453
+ benchmark::utils::CheckF16C)
454
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
455
+ ->UseRealTime();
456
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x40,
457
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x40,
458
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
459
+ benchmark::utils::CheckF16C)
460
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
461
+ ->UseRealTime();
462
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x48,
463
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x48,
464
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
465
+ benchmark::utils::CheckF16C)
466
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
467
+ ->UseRealTime();
468
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x56,
469
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x56,
470
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
471
+ benchmark::utils::CheckF16C)
472
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
473
+ ->UseRealTime();
474
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x64,
475
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x64,
476
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
477
+ benchmark::utils::CheckF16C)
478
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
479
+ ->UseRealTime();
480
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x72,
481
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x72,
482
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
483
+ benchmark::utils::CheckF16C)
484
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
485
+ ->UseRealTime();
486
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x80,
487
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x80,
488
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
489
+ benchmark::utils::CheckF16C)
490
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
491
+ ->UseRealTime();
492
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x8,
493
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x8,
494
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
495
+ benchmark::utils::CheckF16C)
496
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
497
+ ->UseRealTime();
498
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x16,
499
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x16,
500
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
501
+ benchmark::utils::CheckF16C)
502
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
503
+ ->UseRealTime();
504
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x24,
505
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x24,
506
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
507
+ benchmark::utils::CheckF16C)
508
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
509
+ ->UseRealTime();
510
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x32,
511
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x32,
512
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
513
+ benchmark::utils::CheckF16C)
514
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
515
+ ->UseRealTime();
516
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x40,
517
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x40,
518
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
519
+ benchmark::utils::CheckF16C)
520
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
521
+ ->UseRealTime();
522
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x48,
523
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x48,
524
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
525
+ benchmark::utils::CheckF16C)
526
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
527
+ ->UseRealTime();
528
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x56,
529
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x56,
530
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
531
+ benchmark::utils::CheckF16C)
532
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
533
+ ->UseRealTime();
534
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x64,
535
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x64,
536
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
537
+ benchmark::utils::CheckF16C)
538
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
539
+ ->UseRealTime();
540
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x72,
541
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x72,
542
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
543
+ benchmark::utils::CheckF16C)
544
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
545
+ ->UseRealTime();
546
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x80,
547
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x80,
548
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
549
+ benchmark::utils::CheckF16C)
550
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
551
+ ->UseRealTime();
552
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
553
+
554
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ 
+ 
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
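+   // The kernels in this block replace the vector divide with reciprocal
+   // approximations, so they are also usable on 32-bit ARM: nr1fma refines a
+   // reciprocal estimate with one Newton-Raphson step built from fused
+   // multiply-adds, while the nr1recps and recpeadj groups further below swap
+   // in different refinement strategies.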
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ 
+ 
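+   // nr1recps: same tanh algorithm as nr1fma above, but the Newton-Raphson
+   // step presumably uses NEON's dedicated reciprocal-step instruction
+   // (FRECPS/VRECPS) instead of explicit fused multiply-adds.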
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ 
+ 
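+   // recpeadj: appears to skip the Newton-Raphson iteration entirely and
+   // instead correct the raw reciprocal estimate (FRECPE) with a single
+   // adjustment multiply, trading a little accuracy for lower latency.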
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
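+ 
+ // For orientation, a minimal sketch of the f16_vtanh harness that the
+ // BENCHMARK_CAPTURE entries above bind to (the real definition appears
+ // earlier in this file; the signature below is an assumption, not a copy):
+ //
+ //   static void f16_vtanh(benchmark::State& state,
+ //                         xnn_f16_vunary_ukernel_fn ukernel,
+ //                         xnn_init_f16_tanh_params_fn init_params,  // may be nullptr
+ //                         benchmark::utils::IsaCheckFunction isa_check) {
+ //     if (isa_check != nullptr && !isa_check(state)) return;  // skip on unsupported CPUs
+ //     // ... allocate and randomize a batch_size-element fp16 input buffer ...
+ //     for (auto _ : state) {
+ //       ukernel(batch_size * sizeof(uint16_t), input, output, &params);
+ //     }
+ //   }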
+ 
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
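+ 
+ // Compiling with XNNPACK_BENCHMARK_NO_MAIN defined suppresses the
+ // BENCHMARK_MAIN() entry point above, which lets this translation unit be
+ // linked into a combined benchmark binary that supplies its own main().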