Androidonnxfork committed
Commit: 8b7c501
Parent: 842b645

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.
.bazelrc ADDED
@@ -0,0 +1,57 @@
+ # Basic build settings
+ build --jobs 128
+ build --cxxopt='-std=gnu++14'
+
+ # Sets the default Apple platform to macOS.
+ build --apple_platform_type=macos
+
+ # Android configs.
+ build:android --crosstool_top=//external:android/crosstool
+ build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+ build:android --linkopt=-ldl
+ build:android --linkopt=-Wl,--gc-sections
+
+ build:android_arm --config=android
+ build:android_arm --cpu=armeabi-v7a
+ build:android_arm --fat_apk_cpu=armeabi-v7a
+
+ build:android_arm64 --config=android
+ build:android_arm64 --cpu=arm64-v8a
+ build:android_arm64 --fat_apk_cpu=arm64-v8a
+
+ # iOS configs.
+ build:ios --apple_platform_type=ios
+
+ build:ios_i386 --config=ios
+ build:ios_i386 --cpu=ios_i386
+ build:ios_i386 --watchos_cpus=i386
+
+ build:ios_x86_64 --config=ios
+ build:ios_x86_64 --cpu=ios_x86_64
+ build:ios_x86_64 --watchos_cpus=i386
+
+ build:ios_armv7 --config=ios
+ build:ios_armv7 --cpu=ios_armv7
+ build:ios_armv7 --watchos_cpus=armv7k
+
+ build:ios_arm64 --config=ios
+ build:ios_arm64 --cpu=ios_arm64
+ build:ios_arm64 --watchos_cpus=armv7k
+
+ build:ios_arm64e --config=ios
+ build:ios_arm64e --cpu=ios_arm64e
+ build:ios_arm64e --watchos_cpus=armv7k
+
+ build:ios_sim_arm64 --config=ios
+ build:ios_sim_arm64 --cpu=ios_sim_arm64
+ build:ios_sim_arm64 --watchos_cpus=armv7k
+
+ build:ios_fat --config=ios
+ build:ios_fat --ios_multi_cpus=armv7,arm64
+ build:ios_fat --watchos_cpus=armv7k
+
+ # macOS configs.
+ build:macos --apple_platform_type=macos
+
+ build:macos_arm64 --config=macos
+ build:macos_arm64 --cpu=darwin_arm64
.clang-format ADDED
@@ -0,0 +1,34 @@
+ AllowShortFunctionsOnASingleLine: Inline
+ PackConstructorInitializers: Never
+ ColumnLimit: 120
+ AlignAfterOpenBracket: AlwaysBreak
+ BinPackParameters: false
+ AllowAllParametersOfDeclarationOnNextLine: true
+ BreakBeforeBraces: Stroustrup
+ SpaceAfterCStyleCast: true
+ PointerAlignment: Left
+ ForEachMacros: ['XNN_UNPREDICTABLE', 'XNN_LIKELY', 'XNN_UNLIKELY']
+ IfMacros: ['IF']
+ IndentCaseLabels: true
+ ContinuationIndentWidth: 2
+ SpaceBeforeParens: Custom
+ SpaceBeforeParensOptions:
+   AfterControlStatements: true
+   AfterIfMacros: true
+   AfterForeachMacros: false
+ SpacesBeforeTrailingComments: 2
+ IncludeBlocks: Regroup
+ IncludeCategories:
+   - Regex: '<xnnpack[./][[:alnum:].-]+>'  # match XNNPACK includes first
+     Priority: 5
+   - Regex: 'benchmark.h'  # includes used in benchmarks
+     Priority: 3
+   - Regex: 'bench/'  # includes used in benchmarks
+     Priority: 3
+   - Regex: 'gtest.h'  # includes used in tests
+     Priority: 3
+   - Regex: 'gmock.h'  # includes used in tests
+     Priority: 3
+   - Regex: '<[[:alnum:].]+>'  # system headers
+     Priority: 2  # lower priority to keep it sorted first before XNNPACK includes
+ MaxEmptyLinesToKeep: 2  # used to separate includes from functions
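
With `IncludeBlocks: Regroup`, clang-format re-sorts include blocks by ascending `Priority`, matching each include against the regexes above in listed order. Under these rules a source file would come out roughly like the following hypothetical sketch (the file names are placeholders, not taken from this commit):

```cpp
#include <cstddef>           // Priority 2: system headers sort first
#include <vector>

#include <gtest/gtest.h>     // Priority 3: test and benchmark headers next

#include <xnnpack/common.h>  // Priority 5: XNNPACK headers last
```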
.gitattributes CHANGED
@@ -121,3 +121,5 @@ fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/m
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_decoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.ort filter=lfs diff=lfs merge=lfs -text
  fp16fullonnxsdquantized_in_ort/beautifulrealv6,majicmix,cyberreal,epicrealism,/majicmixRealistic_betterV2V25/vae_encoder/model.with_runtime_opt.ort filter=lfs diff=lfs merge=lfs -text
+ build/CMakeFiles/microkernels-all.dir/build.make filter=lfs diff=lfs merge=lfs -text
+ build/libXNNPACK.a filter=lfs diff=lfs merge=lfs -text
.github/workflows/build.yml ADDED
@@ -0,0 +1,207 @@
+ name: Build using CMake
+ on:
+   push:
+     paths:
+       - '**.S'
+       - '**.c'
+       - '**.cc'
+       - '**.h'
+       - 'CMakeLists.txt'
+       - 'cmake/**'
+       - 'scripts/build-*.sh'
+       - '.github/**/*.yml'
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+   cancel-in-progress: true
+ jobs:
+   cmake-linux-local:
+     runs-on: ubuntu-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Configure and build
+         run: scripts/build-local.sh
+         working-directory: ${{ github.workspace }}
+   cmake-linux-aarch64:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install aarch64 cross-toolchain
+         run: sudo apt install crossbuild-essential-arm64
+       - name: Install qemu-aarch64
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-aarch64.sh -DCMAKE_BUILD_TYPE=Release
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/aarch64
+   cmake-linux-armhf:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 90
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install armhf cross-toolchain
+         run: sudo apt install crossbuild-essential-armhf
+       - name: Install qemu-arm
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-armhf.sh -DCMAKE_BUILD_TYPE=Release
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/armhf
+   cmake-linux-riscv64:
+     runs-on: ubuntu-22.04
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Install riscv64 cross-toolchain
+         run: sudo apt install crossbuild-essential-riscv64
+       - name: Install qemu-riscv64
+         run: sudo apt install qemu-user
+       - name: Configure and build
+         run: scripts/build-linux-riscv64.sh -DCMAKE_BUILD_TYPE=Release -DXNNPACK_ENABLE_RISCV_VECTOR=OFF
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --output-on-failure --parallel $(nproc)
+         working-directory: ${{ github.workspace }}/build/linux/riscv64
+   cmake-windows-arm64:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-arm64.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+   cmake-windows-x64:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-x64.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+         env:
+           CFLAGS: "/UNDEBUG"
+           CXXFLAGS: "/UNDEBUG"
+       - name: Run tests
+         run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
+         working-directory: ${{ github.workspace }}/build/windows/x64
+   cmake-windows-x86:
+     runs-on: windows-latest
+     timeout-minutes: 120
+     steps:
+       - uses: actions/checkout@v3
+       - name: Configure and build
+         run: scripts/build-windows-x86.cmd
+         shell: cmd
+         working-directory: ${{ github.workspace }}
+         env:
+           CFLAGS: "/UNDEBUG"
+           CXXFLAGS: "/UNDEBUG"
+       - name: Run tests
+         run: ctest -C Release --output-on-failure --parallel %NUMBER_OF_PROCESSORS%
+         working-directory: ${{ github.workspace }}/build/windows/x86
+   cmake-macos-arm64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/macos/arm64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=arm64 -DHAVE_STD_REGEX=TRUE ../../..
+         working-directory: ${{ github.workspace }}/build/macos/arm64
+       - name: Build with Xcode
+         run: cmake --build build/macos/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+   cmake-macos-x86_64:
+     runs-on: macos-latest
+     timeout-minutes: 90
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/macos/x86_64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_OSX_ARCHITECTURES=x86_64 -DHAVE_STD_REGEX=TRUE ../../..
+         working-directory: ${{ github.workspace }}/build/macos/x86_64
+       - name: Build with Xcode
+         run: cmake --build build/macos/x86_64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+       - name: Run tests
+         run: ctest --build-config Debug --output-on-failure --parallel $(sysctl -n hw.ncpu)
+         working-directory: ${{ github.workspace }}/build/macos/x86_64
+   cmake-android:
+     strategy:
+       matrix:
+         script: [build-android-arm64.sh, build-android-armv7.sh, build-android-x86.sh]
+     runs-on: ubuntu-latest
+     timeout-minutes: 40
+     steps:
+       - uses: actions/checkout@v3
+       - name: Update apt
+         run: sudo apt update
+       - name: Install ninja
+         run: sudo apt install ninja-build
+       - name: Setup Android NDK
+         id: setup-ndk
+         uses: nttld/setup-ndk@v1
+         with:
+           ndk-version: r23b
+           add-to-path: false
+       - name: Configure and build
+         run: scripts/${{ matrix.script }}
+         working-directory: ${{ github.workspace }}
+         env:
+           ANDROID_NDK: ${{ steps.setup-ndk.outputs.ndk-path }}
+   cmake-ios-arm64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/ios/arm64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=arm64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
+         working-directory: ${{ github.workspace }}/build/ios/arm64
+       - name: Build with Xcode
+         run: cmake --build build/ios/arm64 --parallel $(sysctl -n hw.ncpu) -- -quiet
+         working-directory: ${{ github.workspace }}
+   cmake-ios-x86_64:
+     runs-on: macos-latest
+     timeout-minutes: 60
+     steps:
+       - uses: actions/checkout@v3
+       - name: Create output directory
+         run: mkdir -p build/ios/x86_64
+         working-directory: ${{ github.workspace }}
+       - name: Generate CMake project
+         run: cmake -G Xcode -DCMAKE_SYSTEM_NAME=iOS -DCMAKE_OSX_ARCHITECTURES=x86_64 -DXNNPACK_BUILD_BENCHMARKS=OFF -DXNNPACK_BUILD_TESTS=OFF ../../..
+         working-directory: ${{ github.workspace }}/build/ios/x86_64
+       - name: Build with Xcode
+         run: cmake --build build/ios/x86_64 --parallel $(sysctl -n hw.ncpu) -- -sdk iphonesimulator -quiet
+         working-directory: ${{ github.workspace }}
+
.gitignore CHANGED
@@ -1,15 +1,35 @@
- *.iml
- .gradle
- /local.properties
- /.idea/caches
- /.idea/libraries
- /.idea/modules.xml
- /.idea/workspace.xml
- /.idea/navEditor.xml
- /.idea/assetWizardSettings.xml
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # Copyright 2019 Google LLC
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Build objects and artifacts
+ bazel-bin
+ bazel-genfiles
+ bazel-out
+ bazel-testlogs
+ bazel-XNNPACK
+ bin/
+ build/
+ build-*/
+ deps/
+ lib/
+ libs/
+ obj/
+ out/
+ *.pyc
+ *.pyo
+ *.log
+
+ # System files
  .DS_Store
- /build
- /captures
- .externalNativeBuild
- .cxx
- local.properties
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+ *.swp
BUILD.bazel ADDED
The diff for this file is too large to render.
 
CMakeLists.txt ADDED
The diff for this file is too large to render.
 
CONTRIBUTING.md ADDED
@@ -0,0 +1,28 @@
+ # How to Contribute
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ## Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com/> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ## Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
+
+ ## Community Guidelines
+
+ This project follows [Google's Open Source Community
+ Guidelines](https://opensource.google.com/conduct/).
LICENSE ADDED
@@ -0,0 +1,31 @@
+ BSD License
+
+ For XNNPACK software
+
+ Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+ Copyright 2019 Google LLC
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+  * Neither the name Facebook nor the names of its contributors may be used to
+    endorse or promote products derived from this software without specific
+    prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -1,12 +1,130 @@
- ---
- title: Test
- emoji: 📊
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 3.38.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # XNNPACK
+
+ XNNPACK is a highly optimized solution for neural network inference on ARM, x86, WebAssembly, and RISC-V platforms. XNNPACK is not intended for direct use by deep learning practitioners and researchers; instead it provides low-level performance primitives for accelerating high-level machine learning frameworks, such as [TensorFlow Lite](https://www.tensorflow.org/lite), [TensorFlow.js](https://www.tensorflow.org/js), [PyTorch](https://pytorch.org/), [ONNX Runtime](https://onnxruntime.ai), and [MediaPipe](https://mediapipe.dev).
+
+ ## Supported Architectures
+
+ - ARM64 on Android, iOS, macOS, Linux, and Windows
+ - ARMv7 (with NEON) on Android
+ - ARMv6 (with VFPv2) on Linux
+ - x86 and x86-64 (up to AVX512) on Windows, Linux, macOS, Android, and iOS simulator
+ - WebAssembly MVP
+ - WebAssembly SIMD
+ - [WebAssembly Relaxed SIMD](https://github.com/WebAssembly/relaxed-simd) (experimental)
+ - RISC-V (RV32GC and RV64GC)
+
+ ## Operator Coverage
+
+ XNNPACK implements the following neural network operators:
+
+ - 2D Convolution (including grouped and depthwise)
+ - 2D Deconvolution (AKA Transposed Convolution)
+ - 2D Average Pooling
+ - 2D Max Pooling
+ - 2D ArgMax Pooling (Max Pooling + indices)
+ - 2D Unpooling
+ - 2D Bilinear Resize
+ - 2D Depth-to-Space (AKA Pixel Shuffle)
+ - Add (including broadcasting, two inputs only)
+ - Subtract (including broadcasting)
+ - Divide (including broadcasting)
+ - Maximum (including broadcasting)
+ - Minimum (including broadcasting)
+ - Multiply (including broadcasting)
+ - Squared Difference (including broadcasting)
+ - Global Average Pooling
+ - Channel Shuffle
+ - Fully Connected
+ - Abs (absolute value)
+ - Bankers' Rounding (rounding to nearest, ties to even)
+ - Ceiling (rounding to integer above)
+ - Clamp (includes ReLU and ReLU6)
+ - Convert (includes fixed-point and half-precision quantization and
+   dequantization)
+ - Copy
+ - ELU
+ - Floor (rounding to integer below)
+ - HardSwish
+ - Leaky ReLU
+ - Negate
+ - Sigmoid
+ - Softmax
+ - Square
+ - Tanh
+ - Transpose
+ - Truncation (rounding to integer towards zero)
+ - PReLU
+
+ All operators in XNNPACK support NHWC layout, but additionally allow a custom stride along the **C**hannel dimension. Thus, operators can consume a subset of channels in the input tensor and produce a subset of channels in the output tensor, providing zero-cost Channel Split and Channel Concatenation operations; the sketch after this paragraph illustrates the mechanism.
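
A minimal illustrative sketch of the channel-stride mechanism (not part of the committed README). It mirrors the create/reshape/setup/run calls that appear in this commit's `bench/abs.cc`; the 8-channel tensor, the 3-channel slice, and the names `pixels`, `input`, and `output` are made-up placeholders:

```cpp
#include <xnnpack.h>

// Assumes xnn_initialize(nullptr) has already succeeded.
// Apply Abs to a 3-channel slice of an 8-channel NHWC tensor:
// input pixels are 8 floats apart, the output is packed densely.
xnn_operator_t abs_op = nullptr;
xnn_create_abs_nc_f32(/*channels=*/3, /*input_stride=*/8,
                      /*output_stride=*/3, /*flags=*/0, &abs_op);
xnn_reshape_abs_nc_f32(abs_op, /*batch_size=*/pixels, /*threadpool=*/nullptr);
xnn_setup_abs_nc_f32(abs_op, input /* may point at any channel offset */, output);
xnn_run_operator(abs_op, /*threadpool=*/nullptr);
xnn_delete_operator(abs_op);
```

Because only the strides differ from the dense case, the slicing itself costs nothing, which is what makes Channel Split and Channel Concatenation zero-cost.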
+
+ ## Performance
+
+ ### Mobile phones
+
+ The table below presents **single-threaded** performance of the XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
+
+ | Model                   | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
+ | ----------------------- | :-------: | :---------: | :----------: |
+ | FP32 MobileNet v1 1.0X  | 82        | 86          | 88           |
+ | FP32 MobileNet v2 1.0X  | 49        | 53          | 55           |
+ | FP32 MobileNet v3 Large | 39        | 42          | 44           |
+ | FP32 MobileNet v3 Small | 12        | 14          | 14           |
+
+ The following table presents **multi-threaded** (using as many threads as there are big cores) performance of the XNNPACK library on three generations of MobileNet models and three generations of Pixel phones.
+
+ | Model                   | Pixel, ms | Pixel 2, ms | Pixel 3a, ms |
+ | ----------------------- | :-------: | :---------: | :----------: |
+ | FP32 MobileNet v1 1.0X  | 43        | 27          | 46           |
+ | FP32 MobileNet v2 1.0X  | 26        | 18          | 28           |
+ | FP32 MobileNet v3 Large | 22        | 16          | 24           |
+ | FP32 MobileNet v3 Small | 7         | 6           | 8            |
+
+ Benchmarked on March 27, 2020 with `end2end_bench --benchmark_min_time=5` on an Android/ARM64 build with Android NDK r21 (`bazel build -c opt --config android_arm64 :end2end_bench`) and neural network models with randomized weights and inputs.
+
+ ### Raspberry Pi
+
+ The table below presents **multi-threaded** performance of the XNNPACK library on three generations of MobileNet models and several generations of Raspberry Pi boards.
+
+ | Model                   | RPi Zero W (BCM2835), ms | RPi 2 (BCM2836), ms | RPi 3+ (BCM2837B0), ms | RPi 4 (BCM2711), ms | RPi 4 (BCM2711, ARM64), ms |
+ | ----------------------- | :----------------------: | :-----------------: | :--------------------: | :-----------------: | :------------------------: |
+ | FP32 MobileNet v1 1.0X  | 3919                     | 302                 | 114                    | 72                  | 77                         |
+ | FP32 MobileNet v2 1.0X  | 1987                     | 191                 | 79                     | 41                  | 46                         |
+ | FP32 MobileNet v3 Large | 1658                     | 161                 | 67                     | 38                  | 40                         |
+ | FP32 MobileNet v3 Small | 474                      | 50                  | 22                     | 13                  | 15                         |
+ | INT8 MobileNet v1 1.0X  | 2589                     | 128                 | 46                     | 29                  | 24                         |
+ | INT8 MobileNet v2 1.0X  | 1495                     | 82                  | 30                     | 20                  | 17                         |
+
+ Benchmarked on Feb 8, 2022 with `end2end-bench --benchmark_min_time=5` on a Raspbian Buster build with CMake (`./scripts/build-local.sh`) and neural network models with randomized weights and inputs. INT8 inference was evaluated with a per-channel quantization schema.
+
+ ## Minimum build requirements
+
+ - C11
+ - C++14
+ - Python 3
+
+ ## Publications
+
+ - Marat Dukhan "The Indirect Convolution Algorithm". Presented at the [Efficient Deep Learning for Computer Vision (ECV) 2019](https://sites.google.com/corp/view/ecv2019/) workshop ([slides](https://drive.google.com/file/d/1ZayB3By5ZxxQIRtN7UDq_JvPg1IYd3Ac/view), [paper on ArXiv](https://arxiv.org/abs/1907.02129)).
+ - Erich Elsen, Marat Dukhan, Trevor Gale, Karen Simonyan "Fast Sparse ConvNets".
+   [Paper on ArXiv](https://arxiv.org/abs/1911.09723), [pre-trained sparse
+   models](https://github.com/google-research/google-research/tree/master/fastconvnets).
+ - Marat Dukhan, Artsiom Ablavatski "The Two-Pass Softmax Algorithm".
+   [Paper on ArXiv](https://arxiv.org/abs/2001.04438).
+ - Yury Pisarchyk, Juhyun Lee "Efficient Memory Management for Deep Neural Net Inference".
+   [Paper on ArXiv](https://arxiv.org/abs/2001.03288).
+
+ ## Ecosystem
+
+ ### Machine Learning Frameworks
+
+ - [TensorFlow Lite](https://blog.tensorflow.org/2020/07/accelerating-tensorflow-lite-xnnpack-integration.html)
+ - [TensorFlow.js WebAssembly backend](https://blog.tensorflow.org/2020/03/introducing-webassembly-backend-for-tensorflow-js.html)
+ - [PyTorch Mobile](https://pytorch.org/mobile)
+ - [ONNX Runtime Mobile](https://onnxruntime.ai/docs/execution-providers/Xnnpack-ExecutionProvider.html)
+ - [MediaPipe for the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html)
+ - [Alibaba HALO (Heterogeneity-Aware Lowering and Optimization)](https://github.com/alibaba/heterogeneity-aware-lowering-and-optimization)
+ - [Samsung ONE (On-device Neural Engine)](https://github.com/Samsung/ONE)
+
+ ## Acknowledgements
+
+ XNNPACK is based on the [QNNPACK](https://github.com/pytorch/QNNPACK) library. Over time its codebase has diverged significantly, and the XNNPACK API is no longer compatible with QNNPACK.
WORKSPACE ADDED
@@ -0,0 +1,89 @@
+ workspace(name = "xnnpack")
+
+ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+ # Bazel rule definitions
+ http_archive(
+     name = "rules_cc",
+     strip_prefix = "rules_cc-main",
+     urls = ["https://github.com/bazelbuild/rules_cc/archive/main.zip"],
+ )
+
+ # Bazel Skylib.
+ http_archive(
+     name = "bazel_skylib",
+     sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+     urls = [
+         "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+         "https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+     ],
+ )
+
+ # Google Test framework, used by most unit-tests.
+ http_archive(
+     name = "com_google_googletest",
+     sha256 = "5cb522f1427558c6df572d6d0e1bf0fd076428633d080e88ad5312be0b6a8859",
+     strip_prefix = "googletest-e23cdb78e9fef1f69a9ef917f447add5638daf2a",
+     urls = ["https://github.com/google/googletest/archive/e23cdb78e9fef1f69a9ef917f447add5638daf2a.zip"],
+ )
+
+ # Google Benchmark library, used in micro-benchmarks.
+ http_archive(
+     name = "com_google_benchmark",
+     sha256 = "1ba14374fddcd9623f126b1a60945e4deac4cdc4fb25a5f25e7f779e36f2db52",
+     strip_prefix = "benchmark-d2a8a4ee41b923876c034afb939c4fc03598e622",
+     urls = ["https://github.com/google/benchmark/archive/d2a8a4ee41b923876c034afb939c4fc03598e622.zip"],
+ )
+
+ # FP16 library, used for half-precision conversions
+ http_archive(
+     name = "FP16",
+     build_file = "@//third_party:FP16.BUILD",
+     sha256 = "e66e65515fa09927b348d3d584c68be4215cfe664100d01c9dbc7655a5716d70",
+     strip_prefix = "FP16-0a92994d729ff76a58f692d3028ca1b64b145d91",
+     urls = [
+         "https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip",
+     ],
+ )
+
+ # FXdiv library, used for repeated integer division by the same factor
+ http_archive(
+     name = "FXdiv",
+     sha256 = "ab7dfb08829bee33dca38405d647868fb214ac685e379ec7ef2bebcd234cd44d",
+     strip_prefix = "FXdiv-b408327ac2a15ec3e43352421954f5b1967701d1",
+     urls = ["https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip"],
+ )
+
+ # pthreadpool library, used for parallelization
+ http_archive(
+     name = "pthreadpool",
+     sha256 = "e6370550a1abf1503daf3c2c196e0a1c2b253440c39e1a57740ff49af2d8bedf",
+     strip_prefix = "pthreadpool-43edadc654d6283b4b6e45ba09a853181ae8e850",
+     urls = ["https://github.com/Maratyszcza/pthreadpool/archive/43edadc654d6283b4b6e45ba09a853181ae8e850.zip"],
+ )
+
+ # cpuinfo library, used for detecting processor characteristics
+ http_archive(
+     name = "cpuinfo",
+     sha256 = "609fc42c47482c1fc125dccac65e843f640e792540162581c4b7eb6ff81c826a",
+     strip_prefix = "cpuinfo-87d8234510367db49a65535021af5e1838a65ac2",
+     urls = [
+         "https://github.com/pytorch/cpuinfo/archive/87d8234510367db49a65535021af5e1838a65ac2.zip",
+     ],
+ )
+
+ # Ruy library, used to benchmark against
+ http_archive(
+     name = "ruy",
+     sha256 = "fe8345f521bb378745ebdd0f8c5937414849936851d2ec2609774eb2d7098e54",
+     strip_prefix = "ruy-9f53ba413e6fc879236dcaa3e008915973d67a4f",
+     urls = [
+         "https://github.com/google/ruy/archive/9f53ba413e6fc879236dcaa3e008915973d67a4f.zip",
+     ],
+ )
+
+ # Android NDK location and version is auto-detected from $ANDROID_NDK_HOME environment variable
+ android_ndk_repository(name = "androidndk")
+
+ # Android SDK location and API is auto-detected from $ANDROID_HOME environment variable
+ android_sdk_repository(name = "androidsdk")
bench/abs.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_abs_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t abs_op = nullptr;
+   status = xnn_create_abs_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &abs_op);
+   if (status != xnn_status_success || abs_op == nullptr) {
+     state.SkipWithError("failed to create Abs operator");
+     return;
+   }
+
+   status = xnn_reshape_abs_nc_f16(abs_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Abs operator");
+     return;
+   }
+
+   status = xnn_setup_abs_nc_f16(abs_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Abs operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(abs_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Abs operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(abs_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Abs operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_abs_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t abs_op = nullptr;
+   status = xnn_create_abs_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &abs_op);
+   if (status != xnn_status_success || abs_op == nullptr) {
+     state.SkipWithError("failed to create Abs operator");
+     return;
+   }
+
+   status = xnn_reshape_abs_nc_f32(abs_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Abs operator");
+     return;
+   }
+
+   status = xnn_setup_abs_nc_f32(abs_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Abs operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(abs_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Abs operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(abs_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Abs operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_abs_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_ABS);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+       builder.CreateVector<int32_t>(shape.data(), shape.size()),
+       tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+       builder.CreateVector<int32_t>(shape.data(), shape.size()),
+       tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Abs model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_abs_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_abs_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_abs_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
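
Both XNNPACK benchmarks in this file share one operator lifecycle, which the other `bench/` files in this commit repeat. As a reading sketch (error handling elided; `batch_size`, `input`, and `output` are placeholders), it reduces to:

```cpp
xnn_initialize(/*allocator=*/nullptr);                           // once per process
xnn_operator_t op = nullptr;
xnn_create_abs_nc_f32(1, 1, 1, /*flags=*/0, &op);                // channels + strides
xnn_reshape_abs_nc_f32(op, batch_size, /*threadpool=*/nullptr);  // size-dependent work
xnn_setup_abs_nc_f32(op, input, output);                         // bind buffers
xnn_run_operator(op, /*threadpool=*/nullptr);                    // the timed region
xnn_delete_operator(op);                                         // release
```

Only `xnn_run_operator` sits inside the benchmark loop, so the reported numbers exclude one-time creation, reshape, and setup costs.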
bench/average-pooling.cc ADDED
@@ -0,0 +1,429 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <limits>
14
+ #include <memory>
15
+ #include <random>
16
+ #include <vector>
17
+
18
+ #include <xnnpack.h>
19
+
20
+ #include <benchmark/benchmark.h>
21
+ #ifdef BENCHMARK_TENSORFLOW_LITE
22
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
23
+ #include "tensorflow/lite/interpreter.h"
24
+ #include "tensorflow/lite/kernels/register.h"
25
+ #include "tensorflow/lite/model.h"
26
+ #include "tensorflow/lite/schema/schema_generated.h"
27
+ #include "tensorflow/lite/version.h"
28
+ #endif // BENCHMARK_TENSORFLOW_LITE
29
+ #include "bench/utils.h"
30
+
31
+ static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net) {
32
+ const size_t batch_size = state.range(0);
33
+ const size_t input_height = state.range(1);
34
+ const size_t input_width = state.range(2);
35
+ const size_t pooling_size = state.range(3);
36
+ const size_t padding_size = state.range(4);
37
+ const size_t stride = state.range(5);
38
+ const size_t channels = state.range(6);
39
+
40
+ std::random_device random_device;
41
+ auto rng = std::mt19937(random_device());
42
+ auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
43
+
44
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
45
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
46
+
47
+ std::vector<uint8_t> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(uint8_t));
48
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
49
+ std::vector<uint8_t> output(batch_size * output_height * output_width * channels);
50
+ std::fill(output.begin(), output.end(), 0xA5);
51
+
52
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
53
+ if (status != xnn_status_success) {
54
+ state.SkipWithError("failed to initialize XNNPACK");
55
+ return;
56
+ }
57
+
58
+ xnn_operator_t pooling_op = nullptr;
59
+ status = xnn_create_average_pooling2d_nhwc_qu8(
60
+ padding_size, padding_size, padding_size, padding_size,
61
+ pooling_size, pooling_size,
62
+ stride, stride,
63
+ channels, channels /* input pixel stride */, channels /* output pixel stride */,
64
+ 127 /* input zero point */, 0.75f /* input scale */,
65
+ 127 /* output zero point */, 1.25f /* output scale */,
66
+ 0, 255,
67
+ 0 /* flags */, &pooling_op);
68
+ if (status != xnn_status_success) {
69
+ state.SkipWithError("failed to create Average Pooling operator");
70
+ return;
71
+ }
72
+
73
+ status = xnn_reshape_average_pooling2d_nhwc_qu8(
74
+ pooling_op,
75
+ batch_size, input_height, input_width,
76
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
77
+ nullptr /* thread pool */);
78
+ if (status != xnn_status_success) {
79
+ state.SkipWithError("failed to reshape Average Pooling operator");
80
+ return;
81
+ }
82
+
83
+ status = xnn_setup_average_pooling2d_nhwc_qu8(
84
+ pooling_op,
85
+ input.data(), output.data());
86
+ if (status != xnn_status_success) {
87
+ state.SkipWithError("failed to setup Average Pooling operator");
88
+ return;
89
+ }
90
+
91
+ for (auto _ : state) {
92
+ status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
93
+ if (status != xnn_status_success) {
94
+ state.SkipWithError("failed to run Average Pooling operator");
95
+ return;
96
+ }
97
+ }
98
+
99
+ status = xnn_delete_operator(pooling_op);
100
+ if (status != xnn_status_success) {
101
+ state.SkipWithError("failed to delete Average Pooling operator");
102
+ return;
103
+ }
104
+ pooling_op = nullptr;
105
+
106
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
107
+ if (cpu_frequency != 0) {
108
+ state.counters["cpufreq"] = cpu_frequency;
109
+ }
110
+
111
+ state.counters["bytes"] = benchmark::Counter(
112
+ uint64_t(state.iterations()) *
113
+ batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(uint8_t),
114
+ benchmark::Counter::kIsRate);
115
+ }
116
+
117
+ static void xnnpack_average_pooling_f32(benchmark::State& state, const char* net) {
118
+ const size_t batch_size = state.range(0);
119
+ const size_t input_height = state.range(1);
120
+ const size_t input_width = state.range(2);
121
+ const size_t pooling_size = state.range(3);
122
+ const size_t padding_size = state.range(4);
123
+ const size_t stride = state.range(5);
124
+ const size_t channels = state.range(6);
125
+
126
+ std::random_device random_device;
127
+ auto rng = std::mt19937(random_device());
128
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
129
+
130
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
131
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
132
+
133
+ std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
134
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
135
+ std::vector<float> output(batch_size * output_height * output_width * channels);
136
+ std::fill(output.begin(), output.end(), std::nanf(""));
137
+
138
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
139
+ if (status != xnn_status_success) {
140
+ state.SkipWithError("failed to initialize XNNPACK");
141
+ return;
142
+ }
143
+
144
+ xnn_operator_t pooling_op = nullptr;
145
+ status = xnn_create_average_pooling2d_nhwc_f32(
146
+ padding_size, padding_size, padding_size, padding_size,
147
+ pooling_size, pooling_size,
148
+ stride, stride,
149
+ channels, channels /* input pixel stride */, channels /* output pixel stride */,
150
+ -std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity(),
151
+ 0 /* flags */, &pooling_op);
152
+ if (status != xnn_status_success) {
153
+ state.SkipWithError("failed to create Average Pooling operator");
154
+ return;
155
+ }
156
+
157
+ status = xnn_reshape_average_pooling2d_nhwc_f32(
158
+ pooling_op,
159
+ batch_size, input_height, input_width,
160
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
161
+ nullptr /* thread pool */);
162
+ if (status != xnn_status_success) {
163
+ state.SkipWithError("failed to reshape Average Pooling operator");
164
+ return;
165
+ }
166
+
167
+ status = xnn_setup_average_pooling2d_nhwc_f32(
168
+ pooling_op,
169
+ input.data(), output.data());
170
+ if (status != xnn_status_success) {
171
+ state.SkipWithError("failed to setup Average Pooling operator");
172
+ return;
173
+ }
174
+
175
+ for (auto _ : state) {
176
+ status = xnn_run_operator(pooling_op, nullptr /* thread pool */);
177
+ if (status != xnn_status_success) {
178
+ state.SkipWithError("failed to run Average Pooling operator");
179
+ return;
180
+ }
181
+ }
182
+
183
+ status = xnn_delete_operator(pooling_op);
184
+ if (status != xnn_status_success) {
185
+ state.SkipWithError("failed to delete Average Pooling operator");
186
+ return;
187
+ }
188
+ pooling_op = nullptr;
189
+
190
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
191
+ if (cpu_frequency != 0) {
192
+ state.counters["cpufreq"] = cpu_frequency;
193
+ }
194
+
195
+ state.counters["bytes"] = benchmark::Counter(
196
+ uint64_t(state.iterations()) *
197
+ batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
198
+ benchmark::Counter::kIsRate);
199
+ }
200
+
201
+ #ifdef BENCHMARK_TENSORFLOW_LITE
202
+ void tflite_average_pooling_f32(benchmark::State& state, const char* net) {
203
+ const size_t batch_size = state.range(0);
204
+ const size_t input_height = state.range(1);
205
+ const size_t input_width = state.range(2);
206
+ const size_t pooling_size = state.range(3);
207
+ const size_t padding_size = state.range(4);
208
+ const size_t stride = state.range(5);
209
+ const size_t channels = state.range(6);
210
+
211
+ std::random_device random_device;
212
+ auto rng = std::mt19937(random_device());
213
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
214
+
215
+ tflite::Padding padding = tflite::Padding_VALID;
216
+ if (2 * padding_size == (pooling_size - 1)) {
217
+ padding = tflite::Padding_SAME;
218
+ } else if (padding_size == 0) {
219
+ padding = tflite::Padding_VALID;
220
+ } else {
221
+ state.SkipWithError("unsupported padding");
222
+ return;
223
+ }
224
+
225
+ const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
226
+ const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;
227
+
228
+ std::vector<float> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(float));
229
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
230
+ std::vector<float> output(batch_size * output_height * output_width * channels);
231
+ std::fill(output.begin(), output.end(), std::nanf(""));
232
+
233
+ flatbuffers::FlatBufferBuilder builder;
234
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
235
+ CreateOperatorCode(builder, tflite::BuiltinOperator_AVERAGE_POOL_2D);
236
+
237
+ flatbuffers::Offset<tflite::Pool2DOptions> pool2d_options = CreatePool2DOptions(
238
+ builder, padding,
239
+ stride /* stride_w */, stride /* stride_h */,
240
+ pooling_size /* filter_width */, pooling_size /* filter_height */,
241
+ tflite::ActivationFunctionType_NONE);
242
+
243
+ flatbuffers::Offset<tflite::Buffer> buffers[1] = {
244
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
245
+ };
246
+
247
+ const int32_t input_shape[4] = {
248
+ static_cast<int32_t>(batch_size),
249
+ static_cast<int32_t>(input_height),
250
+ static_cast<int32_t>(input_width),
251
+ static_cast<int32_t>(channels)
252
+ };
253
+ const int32_t output_shape[4] = {
254
+ static_cast<int32_t>(batch_size),
255
+ static_cast<int32_t>(output_height),
256
+ static_cast<int32_t>(output_width),
257
+ static_cast<int32_t>(channels)
258
+ };
259
+
260
+ flatbuffers::Offset<tflite::Tensor> tensors[2] = {
261
+ tflite::CreateTensor(builder,
262
+ builder.CreateVector<int32_t>(input_shape, 4),
263
+ tflite::TensorType_FLOAT32),
264
+ tflite::CreateTensor(builder,
265
+ builder.CreateVector<int32_t>(output_shape, 4),
266
+ tflite::TensorType_FLOAT32),
267
+ };
268
+
269
+ const int32_t op_inputs[1] = { 0 };
270
+ const int32_t op_outputs[1] = { 1 };
271
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
272
+ builder,
273
+ 0 /* opcode_index */,
274
+ builder.CreateVector<int32_t>(op_inputs, 1),
275
+ builder.CreateVector<int32_t>(op_outputs, 1),
276
+ tflite::BuiltinOptions_Pool2DOptions,
277
+ pool2d_options.Union());
278
+
279
+ const int32_t graph_inputs[1] = { 0 };
280
+ const int32_t graph_outputs[1] = { 1 };
281
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
282
+ builder,
283
+ builder.CreateVector(tensors, 2),
284
+ builder.CreateVector<int32_t>(graph_inputs, 1),
285
+ builder.CreateVector<int32_t>(graph_outputs, 1),
286
+ builder.CreateVector(&op, 1));
287
+
288
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
289
+ TFLITE_SCHEMA_VERSION,
290
+ builder.CreateVector(&operator_code, 1),
291
+ builder.CreateVector(&subgraph, 1),
292
+ builder.CreateString("AVERAGE_POOL_2D model"),
293
+ builder.CreateVector(buffers, 1));
294
+
295
+ builder.Finish(model_buffer);
296
+
297
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
298
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
299
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
300
+ std::unique_ptr<tflite::Interpreter> interpreter;
301
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
302
+ state.SkipWithError("failed to create TFLite interpreter");
303
+ return;
304
+ }
305
+ if (interpreter == nullptr) {
306
+ state.SkipWithError("TFLite interpreter is null");
307
+ return;
308
+ }
309
+ interpreter->SetNumThreads(1);
310
+
311
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
312
+ state.SkipWithError("failed to allocate tensors");
313
+ return;
314
+ }
315
+
316
+ std::generate(
317
+ interpreter->typed_tensor<float>(0),
318
+ interpreter->typed_tensor<float>(0) + batch_size * input_height * input_width * channels,
319
+ std::ref(f32rng));
320
+
321
+ for (auto _ : state) {
322
+ if (interpreter->Invoke() != kTfLiteOk) {
323
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["bytes"] = benchmark::Counter(
+     uint64_t(state.iterations()) *
+       batch_size * (input_height * input_width + output_height * output_width) * channels * sizeof(float),
+     benchmark::Counter::kIsRate);
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ // Final global average pooling in ImageNet classification models.
+ static void ImageNet(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W   K  P  S  C    */
+   b->Args({1, 13, 13, 13, 0, 1, 1000});
+   b->Args({1,  7,  7,  7, 0, 1, 1000});
+ }
+
+ // ShuffleNet v1 with 1 group.
+ static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 144});
+   b->Args({1, 14, 14, 3, 1, 2, 288});
+   b->Args({1,  7,  7, 3, 1, 2, 576});
+ }
+
+ // ShuffleNet v1 with 2 groups.
+ static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 200});
+   b->Args({1, 14, 14, 3, 1, 2, 400});
+   b->Args({1,  7,  7, 3, 1, 2, 800});
+ }
+
+ // ShuffleNet v1 with 3 groups.
+ static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C   */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 240});
+   b->Args({1, 14, 14, 3, 1, 2, 480});
+   b->Args({1,  7,  7, 3, 1, 2, 960});
+ }
+
+ // ShuffleNet v1 with 4 groups.
+ static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C    */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 272});
+   b->Args({1, 14, 14, 3, 1, 2, 576});
+   b->Args({1,  7,  7, 3, 1, 2, 1088});
+ }
+
+ // ShuffleNet v1 with 8 groups.
+ static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "K", "P", "S", "C"});
+
+   /*       N   H   W  K  P  S  C    */
+   b->Args({1, 56, 56, 3, 1, 2, 24});
+   b->Args({1, 28, 28, 3, 1, 2, 384});
+   b->Args({1, 14, 14, 3, 1, 2, 768});
+   b->Args({1,  7,  7, 3, 1, 2, 1536});
+ }
+
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_average_pooling_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, imagenet, "ImageNet")->Apply(ImageNet)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_average_pooling_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
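For readers cross-checking the "bytes" counter above: with padding P = 0 and stride S = 1, a K x K pooling window over an H x W input produces an (H - K + 1) x (W - K + 1) output, so both ImageNet cases reduce to a 1x1 global average. A minimal standalone sketch of the arithmetic (the variable names are illustrative, not part of the benchmark):

    #include <cstddef>
    #include <cstdio>

    int main() {
      // ImageNet global-average-pooling case from the table above:
      // N=1, H=W=13, K=13, P=0, S=1, C=1000, float elements.
      const size_t N = 1, H = 13, W = 13, K = 13, P = 0, S = 1, C = 1000;
      const size_t OH = (H + 2 * P - K) / S + 1;  // = 1
      const size_t OW = (W + 2 * P - K) / S + 1;  // = 1
      const size_t bytes = N * (H * W + OH * OW) * C * sizeof(float);
      std::printf("output %zux%zu, %zu bytes per iteration\n", OH, OW, bytes);  // 1x1, 680000
      return 0;
    }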
bench/bankers-rounding.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2020 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_bankers_rounding_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t bankers_rounding_op = nullptr;
+   status = xnn_create_bankers_rounding_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &bankers_rounding_op);
+   if (status != xnn_status_success || bankers_rounding_op == nullptr) {
+     state.SkipWithError("failed to create Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_reshape_bankers_rounding_nc_f16(bankers_rounding_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_setup_bankers_rounding_nc_f16(bankers_rounding_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Bankers' Rounding operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(bankers_rounding_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Bankers' Rounding operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(bankers_rounding_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Bankers' Rounding operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_bankers_rounding_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t bankers_rounding_op = nullptr;
+   status = xnn_create_bankers_rounding_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &bankers_rounding_op);
+   if (status != xnn_status_success || bankers_rounding_op == nullptr) {
+     state.SkipWithError("failed to create Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_reshape_bankers_rounding_nc_f32(bankers_rounding_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Bankers' Rounding operator");
+     return;
+   }
+
+   status = xnn_setup_bankers_rounding_nc_f32(bankers_rounding_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Bankers' Rounding operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(bankers_rounding_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Bankers' Rounding operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(bankers_rounding_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Bankers' Rounding operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_bankers_rounding_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_ROUND);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Round model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_bankers_rounding_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_bankers_rounding_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_bankers_rounding_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
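As a reminder of what the operator benchmarked above computes: bankers' rounding is round-half-to-even (the IEEE 754 default), so ties go to the even neighbor rather than away from zero; TFLite's ROUND op is documented to round half to even as well, which is what makes the two benchmarks comparable. A standalone illustration using only the C++ standard library, not XNNPACK:

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
      std::fesetround(FE_TONEAREST);  // round-to-nearest, ties-to-even
      // std::nearbyint() rounds according to the current rounding mode.
      std::printf("0.5 -> %.0f\n", std::nearbyint(0.5));  // 0
      std::printf("1.5 -> %.0f\n", std::nearbyint(1.5));  // 2
      std::printf("2.5 -> %.0f\n", std::nearbyint(2.5));  // 2
      std::printf("3.5 -> %.0f\n", std::nearbyint(3.5));  // 4
      return 0;
    }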
bench/batch-matrix-multiply.cc ADDED
@@ -0,0 +1,259 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cstddef>
+ #include <cstdint>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <utility>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ void xnnpack_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
+   const size_t batch_size = state.range(0);
+   const size_t m = state.range(1);
+   const size_t n = state.range(2);  // arguments are ordered B, M, N, K (see bench/bgemm.h)
+   const size_t k = state.range(3);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+
+   std::vector<float> input1(batch_size * m * k);
+   std::generate(input1.begin(), input1.end(), std::ref(f32rng));
+   std::vector<float> input2(batch_size * k * n);
+   std::generate(input2.begin(), input2.end(), std::ref(f32rng));
+   const size_t output_elements = batch_size * m * n;
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_buffers =
+     1 + benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), sizeof(float) * output_elements);
+   std::vector<float> output(output_elements * num_buffers);
+
+   std::vector<xnn_operator_t> ops(num_buffers);
+
+   for (xnn_operator_t& op : ops) {
+     status = xnn_create_batch_matrix_multiply_nc_f32(/*flags=*/0, &op);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to create FP32 BatchMatMul operator");
+       return;
+     }
+   }
+
+   std::vector<std::unique_ptr<std::vector<char>>> workspaces;
+
+   for (xnn_operator_t& op : ops) {
+     size_t workspace_size = 0;
+     size_t workspace_alignment = 0;
+     status =
+       xnn_reshape_batch_matrix_multiply_nc_f32(op, batch_size, m, k, n, &workspace_size, &workspace_alignment, nullptr);
+
+     auto workspace = std::make_unique<std::vector<char>>(workspace_size);
+     char* workspace_ptr = workspace->data();
+
+     workspaces.push_back(std::move(workspace));
+
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to reshape FP32 BatchMatMul operator");
+       return;
+     }
+
+     status = xnn_setup_batch_matrix_multiply_nc_f32(op, workspace_ptr, input1.data(), input2.data(), output.data());
+   }
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     status = xnn_run_operator(ops[buffer_index], /*threadpool=*/nullptr);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run FP32 BatchMatMul operator");
+       return;
+     }
+   }
+
+   for (xnn_operator_t& op : ops) {
+     status = xnn_delete_operator(op);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to delete FP32 BatchMatMul operator");
+       return;
+     }
+     op = nullptr;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * batch_size * m * k * n,
+     benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ void tflite_batch_matrix_multiply_f32(benchmark::State& state, const char* net) {
+   const size_t batch_size = state.range(0);
+   const size_t m = state.range(1);
+   const size_t n = state.range(2);  // arguments are ordered B, M, N, K (see bench/bgemm.h)
+   const size_t k = state.range(3);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+
+   std::vector<float> input1(batch_size * m * k);
+   std::generate(input1.begin(), input1.end(), std::ref(f32rng));
+   std::vector<float> input2(batch_size * k * n);
+   std::generate(input2.begin(), input2.end(), std::ref(f32rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_BATCH_MATMUL, 0);
+
+   flatbuffers::Offset<tflite::BatchMatMulOptions> batch_mat_mul_options =
+     tflite::CreateBatchMatMulOptions(builder, false, false, false);
+
+   flatbuffers::Offset<tflite::Buffer> buffers[1] = {
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   };
+
+   const int32_t input1_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(m),
+     static_cast<int32_t>(k),
+   };
+   const int32_t input2_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(k),
+     static_cast<int32_t>(n),
+   };
+   const int32_t output_shape[3] = {
+     static_cast<int32_t>(batch_size),
+     static_cast<int32_t>(m),
+     static_cast<int32_t>(n),
+   };
+
+   flatbuffers::Offset<tflite::Tensor> tensors[3] = {
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(input1_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("input1")),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(input2_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("input2")),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(output_shape, 3),
+                          tflite::TensorType_FLOAT32,
+                          0 /* buffer id */,
+                          builder.CreateString("output")),
+   };
+
+   const int32_t op_inputs[2] = { 0, 1 };
+   const int32_t op_outputs[1] = { 2 };
+   flatbuffers::Offset<tflite::Operator> op = CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs, 2),
+     builder.CreateVector<int32_t>(op_outputs, 1),
+     tflite::BuiltinOptions_BatchMatMulOptions,
+     batch_mat_mul_options.Union());
+
+   const int32_t graph_inputs[2] = { 0, 1 };
+   const int32_t graph_outputs[1] = { 2 };
+   flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors, 3),
+     builder.CreateVector<int32_t>(graph_inputs, 2),
+     builder.CreateVector<int32_t>(graph_outputs, 1),
+     builder.CreateVector(&op, 1),
+     builder.CreateString("BatchMatMul subgraph"));
+
+   flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("BatchMatMul model");
+
+   flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     description,
+     builder.CreateVector(buffers, 1));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   if (interpreter == nullptr) {
+     state.SkipWithError("TFLite interpreter is null");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size * m * k,
+     std::ref(f32rng));
+
+   std::generate(
+     interpreter->typed_tensor<float>(1),
+     interpreter->typed_tensor<float>(1) + batch_size * k * n,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * batch_size * m * k * n,
+     benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
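One detail to keep in mind when comparing numbers across files: the "FLOPS" counter above counts one fused multiply-add per (m, k, n) triple, i.e. batch_size * m * k * n per iteration, while the bf16 GEMM benchmark below multiplies by 2 to count the multiply and the add separately. A tiny sketch of the difference, using the "Albert" shape from bench/bgemm.h for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // "Albert" shape from bench/bgemm.h: B=12, M=384, N=64, K=384.
      const uint64_t B = 12, M = 384, N = 64, K = 384;
      const uint64_t macs  = B * M * N * K;  // what batch-matrix-multiply reports
      const uint64_t flops = 2 * macs;       // multiply + add, as bf16-gemm reports
      std::printf("%llu MACs, %llu FLOPs per iteration\n",
                  (unsigned long long) macs, (unsigned long long) flops);
      return 0;
    }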
bench/bf16-gemm.cc ADDED
@@ -0,0 +1,244 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/gemm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/gemm.h>
+ #include <xnnpack/math.h>
+ #include <xnnpack/pack.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ static void bf16_gemm(benchmark::State& state,
+   xnn_bf16_gemm_minmax_ukernel_fn gemm,
+   size_t mr, size_t nr, size_t kr, size_t sr,
+   xnn_init_bf16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+
+   std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+   std::vector<uint16_t> k(nc * kc);
+   std::generate(k.begin(), k.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+   std::vector<uint16_t> b(nc);
+   std::generate(b.begin(), b.end(), [&] { return fp32_to_bits(f32rng(rng)) >> 16; });
+
+   const size_t w_elements = nc_stride * kc_stride + nc_stride;
+   const size_t c_elements = mc * nc;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7FC0) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_bf16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFF80) /* -inf */, UINT16_C(0x7F80) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
+     // - A is always in L1 cache (if it fits, otherwise L2, L3, etc.)
+     // - W is not in cache (for any cache level)
+     // - C is not in cache (for any cache level)
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < mc; m += mr) {
+       const uint32_t mb = min(mc - m, mr);
+       for (uint32_t n = 0; n < nc; n += nr) {
+         const uint32_t nb = min(nc - n, nr);
+         gemm(
+           mb, nb, kc * sizeof(uint16_t),
+           a.data() + m * kc, kc * sizeof(uint16_t),
+           w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
+           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
+           &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, 1, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 4, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, 5, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, 6, 8, 2, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ static void bf16_gemm_1x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_2x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_3x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x4c8__neonbf16_bfdot(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ static void bf16_gemm_1x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_2x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_3x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_4x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+ static void bf16_gemm_5x4c8__neonbf16_bfmlal(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONBF16);
+ }
+
+ BENCHMARK_GEMM(bf16_gemm_1x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_4x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_5x8c2__neonbf16_bfdot_lane_ld128)
+ BENCHMARK_GEMM(bf16_gemm_6x8c2__neonbf16_bfdot_lane_ld128)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfdot)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfdot)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonbf16_bfmlal)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonbf16_bfmlal)
+ #endif  // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ static void bf16_gemm_1x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_2x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_3x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_4x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_5x4c8__neonfma_zip(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+
+ static void bf16_gemm_1x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, 1, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_2x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, 2, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_3x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, 3, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_4x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, 4, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+ static void bf16_gemm_5x4c8__neonfma_shland(benchmark::State& state, const char* net) {
+   bf16_gemm(state, xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 5, 4, 8, 1,
+             xnn_init_bf16_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
+ }
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_zip)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_zip)
+
+ BENCHMARK_GEMM(bf16_gemm_1x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_2x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_3x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_4x4c8__neonfma_shland)
+ BENCHMARK_GEMM(bf16_gemm_5x4c8__neonfma_shland)
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
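The fp32_to_bits(...) >> 16 pattern above builds bfloat16 test data by truncation: bf16 is the upper half of an IEEE fp32 value (same sign and 8-bit exponent, 7-bit mantissa), so converting back to fp32 is exact. A standalone sketch of that round trip without the XNNPACK helpers (bf16_from_f32 and f32_from_bf16 are illustrative names):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Truncating fp32 -> bf16: keep the high 16 bits, drop the low mantissa bits.
    static uint16_t bf16_from_f32(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      return (uint16_t) (bits >> 16);
    }

    // bf16 -> fp32 is exact: place the 16 bits in the high half.
    static float f32_from_bf16(uint16_t h) {
      const uint32_t bits = (uint32_t) h << 16;
      float x;
      std::memcpy(&x, &bits, sizeof(x));
      return x;
    }

    int main() {
      const float x = 1.2345f;
      const uint16_t h = bf16_from_f32(x);
      std::printf("%.6f -> 0x%04X -> %.6f\n", x, h, f32_from_bf16(h));
      return 0;
    }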
bench/bgemm.h ADDED
@@ -0,0 +1,70 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+ #define BENCHMARK_BGEMM(bgemm_fn) \
+   BENCHMARK_CAPTURE(bgemm_fn, albert, "Albert")->Apply(AlbertBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, mobilebert, "MobileBert")->Apply(MobilebertBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_diffusion, "SD1.X Diffusion")->Apply(SD1XDiffusionBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_encoder_decoder, "SD1.X Encoder-Decoder")->Apply(SD1XEncoderDecoderBgemmArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(bgemm_fn, sd1x_text_encoder, "SD1.X Text Encoder")->Apply(SD1XTextEncoderBgemmArguments)->UseRealTime();
+
+
+ static void AlbertBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*        B    M    N    K  */
+   b->Args({12, 384,  64, 384});
+   b->Args({12, 384, 384,  64});
+ }
+
+ static void MobilebertBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B    M    N    K  */
+   b->Args({4, 384,  32, 384});
+   b->Args({4, 384, 384,  32});
+ }
+
+ static void SD1XDiffusionBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B     M     N     K   */
+   b->Args({8, 4096, 4096,   40});
+   b->Args({8, 4096,   40, 4096});
+   b->Args({8, 4096,   77,   40});
+   b->Args({8, 4096,   40,   77});
+   b->Args({8, 1024, 1024,   80});
+   b->Args({8, 1024,   80, 1024});
+   b->Args({8, 1024,   77,   80});
+   b->Args({8, 1024,   80,   77});
+   b->Args({8,  256,  256,  160});
+   b->Args({8,  256,  160,  256});
+   b->Args({8,  256,   77,  160});
+   b->Args({8,  256,  160,   77});
+   b->Args({8,   64,   64,  160});
+   b->Args({8,   64,  160,   64});
+   b->Args({8,   64,   77,  160});
+   b->Args({8,   64,  160,   77});
+ }
+
+ static void SD1XEncoderDecoderBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*       B     M     N     K   */
+   b->Args({1, 4096, 4096,  512});
+   b->Args({1,  512, 4096, 4096});
+ }
+
+ static void SD1XTextEncoderBgemmArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"B", "M", "N", "K"});
+
+   /*        B   M   N   K  */
+   b->Args({12, 77, 77, 64});
+   b->Args({12, 77, 64, 77});
+ }
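For context on how this header is consumed: BENCHMARK_BGEMM expects a function with the two-argument signature used by BENCHMARK_CAPTURE, reading its shape as B, M, N, K from state.range(0..3). A minimal hypothetical benchmark body (my_bgemm is an illustrative name, not part of the tree):

    #include <benchmark/benchmark.h>
    #include "bench/bgemm.h"

    static void my_bgemm(benchmark::State& state, const char* net) {
      const size_t b = state.range(0);
      const size_t m = state.range(1);
      const size_t n = state.range(2);
      const size_t k = state.range(3);
      for (auto _ : state) {
        // A real benchmark would run a batched matrix multiplication here.
        benchmark::DoNotOptimize(b * m * n * k);
      }
    }

    BENCHMARK_BGEMM(my_bgemm)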
bench/ceiling.cc ADDED
@@ -0,0 +1,277 @@
+ // Copyright 2020 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <fp16/fp16.h>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ static void xnnpack_ceiling_f16(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<uint16_t> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t ceiling_op = nullptr;
+   status = xnn_create_ceiling_nc_f16(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &ceiling_op);
+   if (status != xnn_status_success || ceiling_op == nullptr) {
+     state.SkipWithError("failed to create Ceiling operator");
+     return;
+   }
+
+   status = xnn_reshape_ceiling_nc_f16(ceiling_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Ceiling operator");
+     return;
+   }
+
+   status = xnn_setup_ceiling_nc_f16(ceiling_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Ceiling operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Ceiling operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(ceiling_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Ceiling operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void xnnpack_ceiling_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
+   std::vector<float> output(batch_size);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+   std::fill(output.begin(), output.end(), std::nanf(""));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t ceiling_op = nullptr;
+   status = xnn_create_ceiling_nc_f32(
+     1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+     0 /* flags */, &ceiling_op);
+   if (status != xnn_status_success || ceiling_op == nullptr) {
+     state.SkipWithError("failed to create Ceiling operator");
+     return;
+   }
+
+   status = xnn_reshape_ceiling_nc_f32(ceiling_op, batch_size, /*threadpool=*/nullptr);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape Ceiling operator");
+     return;
+   }
+
+   status = xnn_setup_ceiling_nc_f32(ceiling_op, input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup Ceiling operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(ceiling_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run Ceiling operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(ceiling_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete Ceiling operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ static void tflite_ceiling_f32(benchmark::State& state) {
+   const size_t batch_size = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+
+   flatbuffers::FlatBufferBuilder builder;
+   const flatbuffers::Offset<tflite::OperatorCode> operator_code =
+     CreateOperatorCode(builder, tflite::BuiltinOperator_CEIL);
+
+   const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
+     tflite::CreateBuffer(builder, builder.CreateVector({})),
+   }};
+
+   const std::array<int32_t, 1> shape{{
+     static_cast<int32_t>(batch_size)
+   }};
+
+   const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+     tflite::CreateTensor(builder,
+                          builder.CreateVector<int32_t>(shape.data(), shape.size()),
+                          tflite::TensorType_FLOAT32),
+   }};
+
+   const std::array<int32_t, 1> op_inputs{{ 0 }};
+   const std::array<int32_t, 1> op_outputs{{ 1 }};
+   flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
+     builder,
+     0 /* opcode_index */,
+     builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+     builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
+
+   const std::array<int32_t, 1> graph_inputs{{ 0 }};
+   const std::array<int32_t, 1> graph_outputs{{ 1 }};
+   const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
+     builder,
+     builder.CreateVector(tensors.data(), tensors.size()),
+     builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
+     builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
+     builder.CreateVector(&op, 1));
+
+   const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
+     TFLITE_SCHEMA_VERSION,
+     builder.CreateVector(&operator_code, 1),
+     builder.CreateVector(&subgraph, 1),
+     builder.CreateString("Ceil model"),
+     builder.CreateVector(buffers.data(), buffers.size()));
+
+   builder.Finish(model_buffer);
+
+   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
+   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
+   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
+   std::unique_ptr<tflite::Interpreter> interpreter;
+   if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
+     state.SkipWithError("failed to create TFLite interpreter");
+     return;
+   }
+   interpreter->SetNumThreads(1);
+
+   if (interpreter->AllocateTensors() != kTfLiteOk) {
+     state.SkipWithError("failed to allocate tensors");
+     return;
+   }
+
+   std::generate(
+     interpreter->typed_tensor<float>(0),
+     interpreter->typed_tensor<float>(0) + batch_size,
+     std::ref(f32rng));
+
+   for (auto _ : state) {
+     if (interpreter->Invoke() != kTfLiteOk) {
+       state.SkipWithError("failed to invoke TFLite interpreter");
+       return;
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+
+   interpreter.reset();
+ }
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ BENCHMARK(xnnpack_ceiling_f16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK(xnnpack_ceiling_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK(tflite_ceiling_f32)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+   ->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
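A pattern worth calling out, since it recurs in every elementwise benchmark above: the input buffer is over-allocated by XNN_EXTRA_BYTES / sizeof(element), while the output is sized exactly. My understanding is that this slack exists because XNNPACK microkernels are allowed to load (but not consume) a few bytes past the last input element; the sketch below condenses the allocation pattern under that assumption:

    #include <cstddef>
    #include <vector>

    #include <xnnpack.h>  // defines XNN_EXTRA_BYTES

    // Inputs get XNNPACK's read-ahead slack; outputs are sized exactly.
    std::vector<float> make_padded_input(size_t batch_size) {
      return std::vector<float>(batch_size + XNN_EXTRA_BYTES / sizeof(float));
    }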
bench/channel-shuffle.cc ADDED
@@ -0,0 +1,340 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <random>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include "bench/utils.h"
+
+
+ static void channel_shuffle_x8(benchmark::State& state, const char* net) {
+   const size_t batch_size = static_cast<size_t>(state.range(0));
+   const size_t groups = static_cast<size_t>(state.range(1));
+   const size_t group_channels = static_cast<size_t>(state.range(2));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
+
+   std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels);
+   std::vector<uint8_t> output(batch_size * groups * group_channels);
+   std::generate(input.begin(), input.end(), std::ref(u8rng));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t channel_shuffle_op = nullptr;
+   status = xnn_create_channel_shuffle_nc_x8(
+     groups, group_channels,
+     groups * group_channels /* input stride */,
+     groups * group_channels /* output stride */,
+     0 /* flags */, &channel_shuffle_op);
+   if (status != xnn_status_success || channel_shuffle_op == nullptr) {
+     state.SkipWithError("failed to create X8 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_reshape_channel_shuffle_nc_x8(
+     channel_shuffle_op,
+     batch_size,
+     nullptr /* thread pool */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape X8 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_setup_channel_shuffle_nc_x8(
+     channel_shuffle_op,
+     input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup X8 Channel Shuffle operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(channel_shuffle_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run X8 Channel Shuffle operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(channel_shuffle_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete X8 Channel Shuffle operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = batch_size * groups * group_channels;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ static void channel_shuffle_x32(benchmark::State& state, const char* net) {
+   const size_t batch_size = static_cast<size_t>(state.range(0));
+   const size_t groups = static_cast<size_t>(state.range(1));
+   const size_t group_channels = static_cast<size_t>(state.range(2));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+
+   std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + batch_size * groups * group_channels);
+   std::vector<float> output(batch_size * groups * group_channels);
+   std::generate(input.begin(), input.end(), std::ref(f32rng));
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   xnn_operator_t channel_shuffle_op = nullptr;
+   status = xnn_create_channel_shuffle_nc_x32(
+     groups, group_channels,
+     groups * group_channels /* input stride */,
+     groups * group_channels /* output stride */,
+     0 /* flags */, &channel_shuffle_op);
+   if (status != xnn_status_success || channel_shuffle_op == nullptr) {
+     state.SkipWithError("failed to create X32 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_reshape_channel_shuffle_nc_x32(
+     channel_shuffle_op,
+     batch_size,
+     nullptr /* thread pool */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to reshape X32 Channel Shuffle operator");
+     return;
+   }
+
+   status = xnn_setup_channel_shuffle_nc_x32(
+     channel_shuffle_op,
+     input.data(), output.data());
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to setup X32 Channel Shuffle operator");
+     return;
+   }
+
+   for (auto _ : state) {
+     status = xnn_run_operator(channel_shuffle_op, nullptr /* thread pool */);
+     if (status != xnn_status_success) {
+       state.SkipWithError("failed to run X32 Channel Shuffle operator");
+       return;
+     }
+   }
+
+   status = xnn_delete_operator(channel_shuffle_op);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to delete X32 Channel Shuffle operator");
+     return;
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = batch_size * groups * group_channels;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
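Before the argument tables below, a plain-C++ reference for what the operator computes, assuming the usual ShuffleNet definition: each row of groups * group_channels channels is read as a (groups x group_channels) matrix and written back transposed. This loop is illustrative only, not the XNNPACK implementation:

    #include <cstddef>

    // Reference channel shuffle for NC layout: out[n][k*G + g] = in[n][g*GC + k].
    void channel_shuffle_ref(const float* in, float* out,
                             size_t batch_size, size_t groups, size_t group_channels) {
      const size_t channels = groups * group_channels;
      for (size_t n = 0; n < batch_size; n++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t k = 0; k < group_channels; k++) {
            out[n * channels + k * groups + g] = in[n * channels + g * group_channels + k];
          }
        }
      }
    }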
+ static void ShuffleNetV1G2Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({56 * 56, 2, 25});
+ b->Args({28 * 28, 2, 25});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 50});
+ b->Args({14 * 14, 2, 50});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 100});
+ b->Args({ 7 * 7, 2, 100});
+ }
+
+ static void ShuffleNetV1G3Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 3, 20});
+ b->Args({28 * 28, 3, 20});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 3, 40});
+ b->Args({14 * 14, 3, 40});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 3, 80});
+ b->Args({ 7 * 7, 3, 80});
+ }
+
+ static void ShuffleNetV1G4Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 4, 17});
+ b->Args({28 * 28, 4, 17});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 4, 34});
+ b->Args({14 * 14, 4, 34});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 4, 68});
+ b->Args({ 7 * 7, 4, 68});
+ }
+
+ static void ShuffleNetV1G8Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({56 * 56, 8, 12});
+ b->Args({28 * 28, 8, 12});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 8, 24});
+ b->Args({14 * 14, 8, 24});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 8, 48});
+ b->Args({ 7 * 7, 8, 48});
+ }
+
+ static void ShuffleNetV2x0_5Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 *******/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 24});
+
+ /******** Stage 3 *******/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 48});
+
+ /******** Stage 4 *******/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 96});
+ }
+
+ static void ShuffleNetV2x1_0Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 58});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 116});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 232});
+ }
+
+ static void ShuffleNetV2x1_5Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 88});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 176});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 352});
+ }
+
+ static void ShuffleNetV2x2_0Arguments(benchmark::internal::Benchmark* b)
+ {
+ b->ArgNames({"N", "G", "GC"});
+
+ /******** Stage 2 ********/
+ /* H W G CG */
+ b->Args({28 * 28, 2, 122});
+
+ /******** Stage 3 ********/
+ /* H W G CG */
+ b->Args({14 * 14, 2, 244});
+
+ /******** Stage 4 ********/
+ /* H W G CG */
+ b->Args({ 7 * 7, 2, 488});
+ }
+
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
+
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
+ BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
+
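Each Args row folds the spatial extent into the batch dimension (N = H * W), since channel shuffle acts independently at every spatial position; the H/W comments above each row record the original feature-map size. Registering an extra shape follows the same pattern; a hypothetical sketch (the `custom` name and the 32x32 / 4-group / 16-channel shape are made up for illustration):

// Hypothetical extra instance: one custom shape with 4 groups of 16
// channels over a 32x32 feature map, reusing channel_shuffle_x32 above.
static void CustomShuffleArguments(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "G", "GC"});
  b->Args({32 * 32, 4, 16});
}
BENCHMARK_CAPTURE(channel_shuffle_x32, custom, "custom")->Apply(CustomShuffleArguments)->UseRealTime();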
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/conv.h ADDED
@@ -0,0 +1,852 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+
+ #define BENCHMARK_CONV(conv_fn) \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, inception_v3, "Inception v3")->Apply(InceptionV3ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, resnet18, "ResNet-18")->Apply(ResNet18ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, resnet50, "ResNet-50")->Apply(ResNet50ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, vgg, "VGG")->Apply(VGGConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935ConvArguments)->UseRealTime(); \
+ BENCHMARK_CAPTURE(conv_fn, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955ConvArguments)->UseRealTime();
+
+
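A benchmark translation unit consumes this header by defining a function with the (benchmark::State&, const char*) signature expected by BENCHMARK_CAPTURE and passing it to BENCHMARK_CONV; a minimal hypothetical sketch (the noop body stands in for a real convolution benchmark):

// Hypothetical consumer of this header: BENCHMARK_CONV instantiates one
// Google Benchmark per network using the argument generators defined below.
#include "bench/conv.h"

static void noop_conv(benchmark::State& state, const char* net) {
  (void) net;  // the network name is only used to label the benchmark
  for (auto _ : state) {
    // A real benchmark would run a convolution operator here.
  }
}
BENCHMARK_CONV(noop_conv)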
+ // ShuffleNet v1 with 1 group.
+ static void ShuffleNetV1G1ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 36});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 120});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 36});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 36, 144});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 72});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 144});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 72});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 72, 288});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 288, 144});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 288});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 144});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 144, 576});
+ }
+
+ // ShuffleNet v1 with 2 groups.
+ static void ShuffleNetV1G2ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 50});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 88});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 25});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 25, 100});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 100, 50});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 100});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 50});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 50, 200});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 100});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 200});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 400, 100});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 100, 400});
+ }
+
+ // ShuffleNet v1 with 3 groups.
+ static void ShuffleNetV1G3ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 60});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 72});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 20});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 20, 80});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 80, 40});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 80});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 40});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 160});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 160, 80});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 160});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 80});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 80, 320});
+ }
+
+ // ShuffleNet v1 with 4 groups.
+ static void ShuffleNetV1G4ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 68});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 62});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 17});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 17, 68});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 68, 34});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 68});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 34});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 34, 136});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 136, 68});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 136});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 272, 68});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 68, 272});
+ }
+
+ // ShuffleNet v1 with 8 groups.
+ static void ShuffleNetV1G8ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /*************** Stage 2: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 45});
+ /*************** Stage 2: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 12});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 12, 48});
+ /*************** Stage 3: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 24});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 48});
+ /*************** Stage 3: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 24, 96});
+ /*************** Stage 4: stride-2 unit **************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 48});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 96});
+ /*************** Stage 4: stride-1 units *************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 48});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 48, 192});
+ }
+
+ // ShuffleNet v2 (0.5X scale).
+ static void ShuffleNetV2X05ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 24});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 24});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 48});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 48, 48});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 96});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 192, 1024});
+ }
+
+ // ShuffleNet v2 (1.0X scale).
+ static void ShuffleNetV2X10ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 58});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 58});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 58, 58});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 116, 116});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 116, 116});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 232, 232});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 232, 232});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 464, 1024});
+ }
+
+ // ShuffleNet v2 (1.5X scale).
+ static void ShuffleNetV2X15ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 88});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 176, 176});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 176, 176});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 352, 352});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 352, 352});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 704, 1024});
+ }
+
+ // ShuffleNet v2 (2.0X scale).
+ static void ShuffleNetV2X20ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 24});
+ /********************** Stage 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 122});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 122});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 122, 122});
+ /********************** Stage 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 244, 244});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 244, 244});
+ /********************** Stage 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 488, 488});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 488, 488});
+ /*********************** Conv 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 976, 2048});
+ }
+
+ static void MobileNetV1ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 128});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 128, 128});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 256});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 256, 256});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 512});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 1024});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 1024, 1024});
+ }
+
+ static void MobileNetV2ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 32});
+
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 32, 16});
+
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 96});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 144, 24});
+
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 144});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 144, 32});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 192, 32});
+
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 32, 192});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 192, 64});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 64});
+
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 64, 384});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 384, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 576, 96});
+
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 160});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 320});
+
+ /**************** Pre-pooling Conv2D *****************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 320, 1280});
+ /**************** Post-pooling Conv2D ****************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1000});
+ }
+
+ static void MobileNetV3SmallConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /******************* Initial Stage *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 16, 8});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 8, 16});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 16});
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 16, 72});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 24});
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 88});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 88, 24});
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 96, 24});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 96});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 96, 40});
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 240});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 64});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 64, 240});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 40});
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 40, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 120, 48});
+ /******************** Bottleneck 8 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 144});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 40});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 40, 144});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 144, 48});
+ /******************** Bottleneck 9 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 48, 288});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 288, 72});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 288});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 288, 96});
+ /******************* Bottleneck 10 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
+ /******************* Bottleneck 11 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 144});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 144, 576});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 576, 96});
+ /********************* Last Stage ********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 96, 576});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 576, 1024});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1024, 1001});
+ }
+
+ static void MobileNetV3LargeConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /******************* Initial Stage *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 16});
+ /******************** Bottleneck 1 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 16});
+ /******************** Bottleneck 2 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 24});
+ /******************** Bottleneck 3 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 72, 24});
+ /******************** Bottleneck 4 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 72, 24});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 24, 72});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 72, 40});
+ /******************** Bottleneck 5 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
+ /******************** Bottleneck 6 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 120});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 32});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 32, 120});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 120, 40});
+ /******************** Bottleneck 7 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 40, 240});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 240, 80});
+ /******************** Bottleneck 8 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 200});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 200, 80});
+ /******************** Bottleneck 9 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
+ /******************* Bottleneck 10 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 184});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 184, 80});
+ /******************* Bottleneck 11 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 80, 480});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 480, 120});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 120, 480});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 480, 112});
+ /******************* Bottleneck 12 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 672, 168});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 168, 672});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 672, 112});
+ /******************* Bottleneck 13 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 112, 672});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 672, 160});
+ /******************* Bottleneck 14 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ /******************* Bottleneck 15 *******************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 240});
+ //b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 240, 960});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 960, 160});
+ /******************** Last Stage *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 160, 960});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 960, 1280});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 1280, 1001});
+ }
+
+ // SqueezeNet 1.0
+ static void SqueezeNetV10ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 96});
+ /*********************** Fire 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 96, 16});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
+ //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 32});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 6 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 7 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 48});
+ //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 48, 192});
+ //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 8 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 384, 64});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 64, 256});
+ /*********************** Fire 9 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /********************** Conv 10 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /*********************** Conv 1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 2, 1, 3, 64});
+ /*********************** Fire 2 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 64, 16});
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 3 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 128, 16});
+ //b->Args({ 55, 55, 1, 1, 0, 0, 1, 1, 16, 64});
+ //b->Args({ 55, 55, 3, 3, 2, 2, 1, 1, 16, 64});
+ /*********************** Fire 4 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 128, 32});
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 5 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 256, 32});
+ //b->Args({ 27, 27, 1, 1, 0, 0, 1, 1, 32, 128});
+ //b->Args({ 27, 27, 3, 3, 2, 2, 1, 1, 32, 128});
+ /*********************** Fire 6 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 7 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 48});
+ //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 48, 192});
+ //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 48, 192});
+ /*********************** Fire 8 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 384, 64});
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /*********************** Fire 9 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 64});
+ //b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 64, 256});
+ //b->Args({ 13, 13, 3, 3, 2, 2, 1, 1, 64, 256});
+ /********************** Conv 10 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 13, 13, 1, 1, 0, 0, 1, 1, 512, 1000});
+ }
+
+ static void InceptionV3ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({299, 299, 3, 3, 0, 0, 2, 1, 3, 32});
+ b->Args({149, 149, 3, 3, 0, 0, 1, 1, 32, 32});
+ b->Args({147, 147, 3, 3, 2, 2, 1, 1, 32, 64});
+ b->Args({ 73, 73, 1, 1, 0, 0, 1, 1, 64, 80});
+ b->Args({ 73, 73, 3, 3, 0, 0, 1, 1, 80, 192});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 48});
+ b->Args({ 35, 35, 5, 5, 4, 4, 1, 1, 48, 64});
+ b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 64, 96});
+ b->Args({ 35, 35, 3, 3, 2, 2, 1, 1, 96, 96});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 192, 32});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 256, 48});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 64});
+ b->Args({ 35, 35, 1, 1, 0, 0, 1, 1, 288, 48});
+ b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 288, 384});
+ b->Args({ 35, 35, 3, 3, 0, 0, 2, 1, 96, 96});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 192});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 128});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 128});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 128, 128});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 128, 192});
+ b->Args({ 17, 17, 1, 1, 0, 0, 1, 1, 768, 160});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 160});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 160, 160});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 160, 192});
+ b->Args({ 17, 17, 1, 7, 0, 6, 1, 1, 192, 192});
+ b->Args({ 17, 17, 7, 1, 6, 0, 1, 1, 192, 192});
+ b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 320});
+ b->Args({ 17, 17, 3, 3, 0, 0, 2, 1, 192, 192});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 320});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 384});
+ b->Args({ 8, 8, 1, 3, 0, 2, 1, 1, 384, 384});
+ b->Args({ 8, 8, 3, 1, 2, 0, 1, 1, 384, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 448});
+ b->Args({ 8, 8, 3, 3, 2, 2, 1, 1, 448, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 1280, 192});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 320});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 384});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 448});
+ b->Args({ 8, 8, 1, 1, 0, 0, 1, 1, 2048, 192});
+ b->Args({ 1, 1, 1, 1, 0, 0, 1, 1, 2048, 1001});
+ }
+
+ static void ResNet18ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1 ***********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
+ /********************* Conv 2.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ /********************* Conv 3.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 64, 128});
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
+ b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 64, 128});
+ /********************* Conv 4.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 128, 256});
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
+ b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 128, 256});
+ /********************* Conv 5.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 256, 512});
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
+ b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 256, 512});
+ }
+
+ static void ResNet50ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1 ***********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 7, 7, 6, 6, 2, 1, 3, 64});
+ /********************* Conv 2.1 **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 64});
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ /********************* Conv 2.X **********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 64});
+ //b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 64, 64});
+ //b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 64, 256});
+ /********************** Conv 3.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 128});
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 128, 128});
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
+ b->Args({ 56, 56, 1, 1, 0, 0, 2, 1, 256, 512});
+ /********************** Conv 3.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 128});
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 128, 128});
+ //b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 128, 512});
+ /********************** Conv 4.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 256});
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 256, 256});
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
+ b->Args({ 28, 28, 1, 1, 0, 0, 2, 1, 512, 1024});
+ /********************** Conv 4.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 256});
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 256, 256});
+ //b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 256, 1024});
+ /********************** Conv 5.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 1024, 512});
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 512, 512});
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
+ b->Args({ 14, 14, 1, 1, 0, 0, 2, 1, 1024, 2048});
+ /********************** Conv 5.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 2048, 512});
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 512, 512});
+ //b->Args({ 7, 7, 1, 1, 0, 0, 1, 1, 512, 2048});
+ }
+
+ static void VGGConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /********************** Conv 1.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 1, 1, 3, 64});
+ /********************** Conv 1.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({224, 224, 3, 3, 2, 2, 1, 1, 64, 64});
+
+ /********************** Conv 2.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 64, 128});
+ /********************** Conv 2.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 128, 128});
+
+ /********************** Conv 3.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 128, 256});
+ /********************** Conv 3.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 256, 256});
+ /********************** Conv 3.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 56, 56, 1, 1, 0, 0, 1, 1, 256, 256});
+
+ /********************** Conv 4.1 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 256, 512});
+ /********************** Conv 4.2 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 512, 512});
+ /********************** Conv 4.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 28, 28, 1, 1, 0, 0, 1, 1, 512, 512});
+
+ /********************** Conv 5.X *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 512, 512});
+ /********************** Conv 5.3 *********************/
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({ 14, 14, 1, 1, 0, 0, 1, 1, 512, 512});
+ }
+
+ // SRCNN (9-1-5)
+ static void SRCNN915ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 1, 1, 0, 0, 1, 1, 64, 32});
+ b->Args({376, 376, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-3-5)
+ static void SRCNN935ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 3, 3, 0, 0, 1, 1, 64, 32});
+ b->Args({374, 374, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-5-5)
+ static void SRCNN955ConvArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "GCin", "GCout"});
+
+ /* H W KH KW PH PW S D GCin GCout */
+ b->Args({384, 384, 9, 9, 0, 0, 1, 1, 1, 64});
+ b->Args({376, 376, 5, 5, 0, 0, 1, 1, 64, 32});
+ b->Args({372, 372, 5, 5, 0, 0, 1, 1, 32, 1});
+ }
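For interpreting these rows, the output spatial size follows the usual convolution arithmetic, where PH/PW appear to be total (not per-side) padding, e.g. the 224x224 / 3x3 / PH=2 / S=1 VGG rows keep a 224x224 output. A small hypothetical helper (names made up for illustration) deriving the output size and per-image multiply-accumulate count for a single-group row:

// Hypothetical helper: derive output dims and MACs from one Args row
// {H, W, KH, KW, PH, PW, S, D, GCin, GCout}, assuming a single group.
#include <cstddef>

struct ConvShape {
  size_t output_height, output_width, macs;
};

static ConvShape conv_shape(
    size_t h, size_t w, size_t kh, size_t kw, size_t ph, size_t pw,
    size_t s, size_t d, size_t gcin, size_t gcout) {
  const size_t effective_kh = (kh - 1) * d + 1;  // dilated kernel extent
  const size_t effective_kw = (kw - 1) * d + 1;
  const size_t oh = (h + ph - effective_kh) / s + 1;  // PH/PW = total padding
  const size_t ow = (w + pw - effective_kw) / s + 1;
  return ConvShape{oh, ow, oh * ow * kh * kw * gcin * gcout};
}

For example, the VGG Conv 1.1 row {224, 224, 3, 3, 2, 2, 1, 1, 3, 64} yields a 224x224 output and about 86.7M MACs per image.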
bench/convert.cc ADDED
@@ -0,0 +1,1339 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+
+ void xnnpack_convert_f16_f32(benchmark::State& state) {
+ const size_t batch_size = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::vector<float> output(batch_size);
+ std::fill(output.begin(), output.end(), std::nanf(""));
+
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to initialize XNNPACK");
+ return;
+ }
+
+ xnn_operator_t convert_op = nullptr;
+ status = xnn_create_convert_nc_f16_f32(
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
+ 0 /* flags */, &convert_op);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to create F16->F32 Convert operator");
+ return;
+ }
+
+ status = xnn_reshape_convert_nc_f16_f32(convert_op, batch_size, /*threadpool=*/nullptr);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape F16->F32 Convert operator");
+ return;
+ }
+
+ status = xnn_setup_convert_nc_f16_f32(convert_op, input.data(), output.data());
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to setup F16->F32 Convert operator");
+ return;
+ }
+
+ for (auto _ : state) {
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to run F16->F32 Convert operator");
+ return;
+ }
+ }
+
+ status = xnn_delete_operator(convert_op);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to delete F16->F32 Convert operator");
+ return;
+ }
+ convert_op = nullptr;
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
99
+ void xnnpack_convert_f32_f16(benchmark::State& state) {
100
+ const size_t batch_size = state.range(0);
101
+
102
+ std::random_device random_device;
103
+ auto rng = std::mt19937(random_device());
104
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
105
+
106
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
107
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
108
+ std::vector<uint16_t> output(batch_size);
109
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
110
+
111
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
112
+ if (status != xnn_status_success) {
113
+ state.SkipWithError("failed to initialize XNNPACK");
114
+ return;
115
+ }
116
+
117
+ xnn_operator_t convert_op = nullptr;
118
+ status = xnn_create_convert_nc_f32_f16(
119
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
120
+ 0 /* flags */, &convert_op);
121
+ if (status != xnn_status_success) {
122
+ state.SkipWithError("failed to create F32->F16 Convert operator");
123
+ return;
124
+ }
125
+
126
+ status = xnn_reshape_convert_nc_f32_f16(convert_op, batch_size, /*threadpool=*/nullptr);
127
+ if (status != xnn_status_success) {
128
+ state.SkipWithError("failed to reshape F32->F16 Convert operator");
129
+ return;
130
+ }
131
+
132
+ status = xnn_setup_convert_nc_f32_f16(convert_op, input.data(), output.data());
133
+ if (status != xnn_status_success) {
134
+ state.SkipWithError("failed to setup F32->F16 Convert operator");
135
+ return;
136
+ }
137
+
138
+ for (auto _ : state) {
139
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
140
+ if (status != xnn_status_success) {
141
+ state.SkipWithError("failed to run F32->F16 Convert operator");
142
+ return;
143
+ }
144
+ }
145
+
146
+ status = xnn_delete_operator(convert_op);
147
+ if (status != xnn_status_success) {
148
+ state.SkipWithError("failed to delete F32->F16 Convert operator");
149
+ return;
150
+ }
151
+ convert_op = nullptr;
152
+
153
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
154
+ if (cpu_frequency != 0) {
155
+ state.counters["cpufreq"] = cpu_frequency;
156
+ }
157
+
158
+ state.counters["elements"] =
159
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
160
+
161
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint16_t));
162
+ state.counters["bytes"] =
163
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
164
+ }
165
+
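+ // F32->QS8 quantization uses scale 1/128 and zero point 1, clamping to the
+ // full signed 8-bit range.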
166
+ void xnnpack_convert_f32_qs8(benchmark::State& state) {
167
+ const size_t batch_size = state.range(0);
168
+
169
+ std::random_device random_device;
170
+ auto rng = std::mt19937(random_device());
171
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
172
+
173
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
174
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
175
+ std::vector<int8_t> output(batch_size);
176
+ std::fill(output.begin(), output.end(), 0);
177
+
178
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
179
+ if (status != xnn_status_success) {
180
+ state.SkipWithError("failed to initialize XNNPACK");
181
+ return;
182
+ }
183
+
184
+ xnn_operator_t convert_op = nullptr;
185
+ status = xnn_create_convert_nc_f32_qs8(
186
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
187
+ 1.0f / 128.0f /* scale */, 1 /* zero point */,
188
+ std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
189
+ 0 /* flags */, &convert_op);
190
+ if (status != xnn_status_success) {
191
+ state.SkipWithError("failed to create F32->QS8 Convert operator");
192
+ return;
193
+ }
194
+
195
+ status = xnn_reshape_convert_nc_f32_qs8(convert_op, batch_size, /*threadpool=*/nullptr);
196
+ if (status != xnn_status_success) {
197
+ state.SkipWithError("failed to reshape F32->QS8 Convert operator");
198
+ return;
199
+ }
200
+
201
+ status = xnn_setup_convert_nc_f32_qs8(convert_op, input.data(), output.data());
202
+ if (status != xnn_status_success) {
203
+ state.SkipWithError("failed to setup F32->QS8 Convert operator");
204
+ return;
205
+ }
206
+
207
+ for (auto _ : state) {
208
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
209
+ if (status != xnn_status_success) {
210
+ state.SkipWithError("failed to run F32->QS8 Convert operator");
211
+ return;
212
+ }
213
+ }
214
+
215
+ status = xnn_delete_operator(convert_op);
216
+ if (status != xnn_status_success) {
217
+ state.SkipWithError("failed to delete F32->QS8 Convert operator");
218
+ return;
219
+ }
220
+ convert_op = nullptr;
221
+
222
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
223
+ if (cpu_frequency != 0) {
224
+ state.counters["cpufreq"] = cpu_frequency;
225
+ }
226
+
227
+ state.counters["elements"] =
228
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
229
+
230
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
231
+ state.counters["bytes"] =
232
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
233
+ }
234
+
235
+ void xnnpack_convert_f32_qu8(benchmark::State& state) {
236
+ const size_t batch_size = state.range(0);
237
+
238
+ std::random_device random_device;
239
+ auto rng = std::mt19937(random_device());
240
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
241
+
242
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
243
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
244
+ std::vector<uint8_t> output(batch_size);
245
+ std::fill(output.begin(), output.end(), 0);
246
+
247
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
248
+ if (status != xnn_status_success) {
249
+ state.SkipWithError("failed to initialize XNNPACK");
250
+ return;
251
+ }
252
+
253
+ xnn_operator_t convert_op = nullptr;
254
+ status = xnn_create_convert_nc_f32_qu8(
255
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
256
+ 1.0f / 128.0f /* scale */, 127 /* zero point */,
257
+ std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max(),
258
+ 0 /* flags */, &convert_op);
259
+ if (status != xnn_status_success) {
260
+ state.SkipWithError("failed to create F32->QU8 Convert operator");
261
+ return;
262
+ }
263
+
264
+ status = xnn_reshape_convert_nc_f32_qu8(convert_op, batch_size, /*threadpool=*/nullptr);
265
+ if (status != xnn_status_success) {
266
+ state.SkipWithError("failed to reshape F32->QU8 Convert operator");
267
+ return;
268
+ }
269
+
270
+ status = xnn_setup_convert_nc_f32_qu8(convert_op, input.data(), output.data());
271
+ if (status != xnn_status_success) {
272
+ state.SkipWithError("failed to setup F32->QU8 Convert operator");
273
+ return;
274
+ }
275
+
276
+ for (auto _ : state) {
277
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
278
+ if (status != xnn_status_success) {
279
+ state.SkipWithError("failed to run F32->QU8 Convert operator");
280
+ return;
281
+ }
282
+ }
283
+
284
+ status = xnn_delete_operator(convert_op);
285
+ if (status != xnn_status_success) {
286
+ state.SkipWithError("failed to delete F32->QU8 Convert operator");
287
+ return;
288
+ }
289
+ convert_op = nullptr;
290
+
291
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
292
+ if (cpu_frequency != 0) {
293
+ state.counters["cpufreq"] = cpu_frequency;
294
+ }
295
+
296
+ state.counters["elements"] =
297
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
298
+
299
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
300
+ state.counters["bytes"] =
301
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
302
+ }
303
+
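+ // QS8->QS8 Convert requantizes between two int8 parameter sets
+ // (scale 0.75, zero point -1 on input -> scale 0.5, zero point +1 on output).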
304
+ void xnnpack_convert_qs8(benchmark::State& state) {
305
+ const size_t batch_size = state.range(0);
306
+
307
+ std::random_device random_device;
308
+ auto rng = std::mt19937(random_device());
309
+ auto i8rng = std::bind(
310
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
311
+ std::ref(rng));
312
+
313
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
314
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
315
+ std::vector<int8_t> output(batch_size);
316
+ std::fill(output.begin(), output.end(), INT8_C(0xAA));
317
+
318
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
319
+ if (status != xnn_status_success) {
320
+ state.SkipWithError("failed to initialize XNNPACK");
321
+ return;
322
+ }
323
+
324
+ xnn_operator_t convert_op = nullptr;
325
+ status = xnn_create_convert_nc_qs8(
326
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
327
+ 0.75f /* input scale */, -1 /* input zero point */,
328
+ 0.5f /* output scale */, 1 /* output zero point */,
329
+ 0 /* flags */, &convert_op);
330
+ if (status != xnn_status_success) {
331
+ state.SkipWithError("failed to create QS8 Convert operator");
332
+ return;
333
+ }
334
+
335
+ status = xnn_reshape_convert_nc_qs8(convert_op, batch_size, /*threadpool=*/nullptr);
336
+ if (status != xnn_status_success) {
337
+ state.SkipWithError("failed to reshape QS8 Convert operator");
338
+ return;
339
+ }
340
+
341
+ status = xnn_setup_convert_nc_qs8(convert_op, input.data(), output.data());
342
+ if (status != xnn_status_success) {
343
+ state.SkipWithError("failed to setup QS8 Convert operator");
344
+ return;
345
+ }
346
+
347
+ for (auto _ : state) {
348
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
349
+ if (status != xnn_status_success) {
350
+ state.SkipWithError("failed to run QS8 Convert operator");
351
+ return;
352
+ }
353
+ }
354
+
355
+ status = xnn_delete_operator(convert_op);
356
+ if (status != xnn_status_success) {
357
+ state.SkipWithError("failed to delete QS8 Convert operator");
358
+ return;
359
+ }
360
+ convert_op = nullptr;
361
+
362
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
363
+ if (cpu_frequency != 0) {
364
+ state.counters["cpufreq"] = cpu_frequency;
365
+ }
366
+
367
+ state.counters["elements"] =
368
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
369
+
370
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
371
+ state.counters["bytes"] =
372
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
373
+ }
374
+
375
+ void xnnpack_convert_qs8_f32(benchmark::State& state) {
376
+ const size_t batch_size = state.range(0);
377
+
378
+ std::random_device random_device;
379
+ auto rng = std::mt19937(random_device());
380
+ auto i8rng = std::bind(
381
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
382
+ std::ref(rng));
383
+
384
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
385
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
386
+ std::vector<float> output(batch_size);
387
+ std::fill(output.begin(), output.end(), std::nanf(""));
388
+
389
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
390
+ if (status != xnn_status_success) {
391
+ state.SkipWithError("failed to initialize XNNPACK");
392
+ return;
393
+ }
394
+
395
+ xnn_operator_t convert_op = nullptr;
396
+ status = xnn_create_convert_nc_qs8_f32(
397
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
398
+ 1.0f / 255.0f /* scale */, -128 /* zero point */,
399
+ 0 /* flags */, &convert_op);
400
+ if (status != xnn_status_success) {
401
+ state.SkipWithError("failed to create QS8->F32 Convert operator");
402
+ return;
403
+ }
404
+
405
+ status = xnn_reshape_convert_nc_qs8_f32(convert_op, batch_size, /*threadpool=*/nullptr);
406
+ if (status != xnn_status_success) {
407
+ state.SkipWithError("failed to reshape QS8->F32 Convert operator");
408
+ return;
409
+ }
410
+
411
+ status = xnn_setup_convert_nc_qs8_f32(convert_op, input.data(), output.data());
412
+ if (status != xnn_status_success) {
413
+ state.SkipWithError("failed to setup QS8->F32 Convert operator");
414
+ return;
415
+ }
416
+
417
+ for (auto _ : state) {
418
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
419
+ if (status != xnn_status_success) {
420
+ state.SkipWithError("failed to run QS8->F32 Convert operator");
421
+ return;
422
+ }
423
+ }
424
+
425
+ status = xnn_delete_operator(convert_op);
426
+ if (status != xnn_status_success) {
427
+ state.SkipWithError("failed to delete QS8->F32 Convert operator");
428
+ return;
429
+ }
430
+ convert_op = nullptr;
431
+
432
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
433
+ if (cpu_frequency != 0) {
434
+ state.counters["cpufreq"] = cpu_frequency;
435
+ }
436
+
437
+ state.counters["elements"] =
438
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
439
+
440
+ const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
441
+ state.counters["bytes"] =
442
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
443
+ }
444
+
445
+ void xnnpack_convert_qu8(benchmark::State& state) {
446
+ const size_t batch_size = state.range(0);
447
+
448
+ std::random_device random_device;
449
+ auto rng = std::mt19937(random_device());
450
+ auto u8rng = std::bind(
451
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
452
+ std::ref(rng));
453
+
454
+ std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
455
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
456
+ std::vector<uint8_t> output(batch_size);
457
+ std::fill(output.begin(), output.end(), UINT8_C(0xAA));
458
+
459
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
460
+ if (status != xnn_status_success) {
461
+ state.SkipWithError("failed to initialize XNNPACK");
462
+ return;
463
+ }
464
+
465
+ xnn_operator_t convert_op = nullptr;
466
+ status = xnn_create_convert_nc_qu8(
467
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
468
+ 0.75f /* scale */, 125 /* zero point */,
469
+ 0.5f /* scale */, 130 /* zero point */,
470
+ 0 /* flags */, &convert_op);
471
+ if (status != xnn_status_success) {
472
+ state.SkipWithError("failed to create QU8 Convert operator");
473
+ return;
474
+ }
475
+
476
+ status = xnn_reshape_convert_nc_qu8(convert_op, batch_size, /*threadpool=*/nullptr);
477
+ if (status != xnn_status_success) {
478
+ state.SkipWithError("failed to reshape QU8 Convert operator");
479
+ return;
480
+ }
481
+
482
+ status = xnn_setup_convert_nc_qu8(convert_op, input.data(), output.data());
483
+ if (status != xnn_status_success) {
484
+ state.SkipWithError("failed to setup QU8 Convert operator");
485
+ return;
486
+ }
487
+
488
+ for (auto _ : state) {
489
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
490
+ if (status != xnn_status_success) {
491
+ state.SkipWithError("failed to run QU8 Convert operator");
492
+ return;
493
+ }
494
+ }
495
+
496
+ status = xnn_delete_operator(convert_op);
497
+ if (status != xnn_status_success) {
498
+ state.SkipWithError("failed to delete QU8 Convert operator");
499
+ return;
500
+ }
501
+ convert_op = nullptr;
502
+
503
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
504
+ if (cpu_frequency != 0) {
505
+ state.counters["cpufreq"] = cpu_frequency;
506
+ }
507
+
508
+ state.counters["elements"] =
509
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
510
+
511
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
512
+ state.counters["bytes"] =
513
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
514
+ }
515
+
516
+ void xnnpack_convert_qu8_f32(benchmark::State& state) {
517
+ const size_t batch_size = state.range(0);
518
+
519
+ std::random_device random_device;
520
+ auto rng = std::mt19937(random_device());
521
+ auto u8rng = std::bind(
522
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
523
+ std::ref(rng));
524
+
525
+ std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
526
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
527
+ std::vector<float> output(batch_size);
528
+ std::fill(output.begin(), output.end(), std::nanf(""));
529
+
530
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
531
+ if (status != xnn_status_success) {
532
+ state.SkipWithError("failed to initialize XNNPACK");
533
+ return;
534
+ }
535
+
536
+ xnn_operator_t convert_op = nullptr;
537
+ status = xnn_create_convert_nc_qu8_f32(
538
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
539
+ 1.0f / 128.0f /* scale */, 128 /* zero point */,
540
+ 0 /* flags */, &convert_op);
541
+ if (status != xnn_status_success) {
542
+ state.SkipWithError("failed to create QU8->F32 Convert operator");
543
+ return;
544
+ }
545
+
546
+ status = xnn_reshape_convert_nc_qu8_f32(convert_op, batch_size, /*threadpool=*/nullptr);
547
+ if (status != xnn_status_success) {
548
+ state.SkipWithError("failed to reshape QU8->F32 Convert operator");
549
+ return;
550
+ }
551
+
552
+ status = xnn_setup_convert_nc_qu8_f32(convert_op, input.data(), output.data());
553
+ if (status != xnn_status_success) {
554
+ state.SkipWithError("failed to setup QU8->F32 Convert operator");
555
+ return;
556
+ }
557
+
558
+ for (auto _ : state) {
559
+ status = xnn_run_operator(convert_op, nullptr /* thread pool */);
560
+ if (status != xnn_status_success) {
561
+ state.SkipWithError("failed to run QU8->F32 Convert operator");
562
+ return;
563
+ }
564
+ }
565
+
566
+ status = xnn_delete_operator(convert_op);
567
+ if (status != xnn_status_success) {
568
+ state.SkipWithError("failed to delete QU8->F32 Convert operator");
569
+ return;
570
+ }
571
+ convert_op = nullptr;
572
+
573
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
574
+ if (cpu_frequency != 0) {
575
+ state.counters["cpufreq"] = cpu_frequency;
576
+ }
577
+
578
+ state.counters["elements"] =
579
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
580
+
581
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
582
+ state.counters["bytes"] =
583
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
584
+ }
585
+
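+ // The tflite_convert_* baselines below build a minimal single-operator
+ // QUANTIZE or DEQUANTIZE model in memory with FlatBuffers and run it through
+ // the single-threaded TFLite interpreter. BuiltinOpResolverWithoutDefaultDelegates
+ // keeps the default XNNPACK delegate out, so the reference kernels are measured.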
586
+ #ifdef BENCHMARK_TENSORFLOW_LITE
587
+ void tflite_convert_f16_f32(benchmark::State& state) {
588
+ const size_t batch_size = state.range(0);
589
+
590
+ std::random_device random_device;
591
+ auto rng = std::mt19937(random_device());
592
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
593
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
594
+
595
+ flatbuffers::FlatBufferBuilder builder;
596
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
597
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
598
+
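+ // TFLite convention: buffer 0 is the empty buffer; tensors referencing it
+ // are allocated by the interpreter and filled at runtime.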
599
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
600
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
601
+ }};
602
+
603
+ const std::array<int32_t, 1> shape{{
604
+ static_cast<int32_t>(batch_size)
605
+ }};
606
+
607
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
608
+ tflite::CreateTensor(builder,
609
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
610
+ tflite::TensorType_FLOAT16),
611
+ tflite::CreateTensor(builder,
612
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
613
+ tflite::TensorType_FLOAT32)
614
+ }};
615
+
616
+ const std::array<int32_t, 1> op_inputs{{0}};
617
+ const std::array<int32_t, 1> op_outputs{{1}};
618
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
619
+ 0 /* opcode_index */,
620
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
621
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
622
+
623
+ const std::array<int32_t, 1> graph_inputs{{0}};
624
+ const std::array<int32_t, 1> graph_outputs{{1}};
625
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
626
+ builder,
627
+ builder.CreateVector(tensors.data(), tensors.size()),
628
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
629
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
630
+ builder.CreateVector(&op, 1));
631
+
632
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
633
+
634
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
635
+ TFLITE_SCHEMA_VERSION,
636
+ builder.CreateVector(&operator_code, 1),
637
+ builder.CreateVector(&subgraph, 1),
638
+ description,
639
+ builder.CreateVector(buffers.data(), buffers.size()));
640
+
641
+ builder.Finish(model_buffer);
642
+
643
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
644
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
645
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
646
+ std::unique_ptr<tflite::Interpreter> interpreter;
647
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
648
+ state.SkipWithError("failed to create TFLite interpreter");
649
+ return;
650
+ }
651
+ interpreter->SetNumThreads(1);
652
+
653
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
654
+ state.SkipWithError("failed to allocate tensors");
655
+ return;
656
+ }
657
+
658
+ uint16_t* input_data = reinterpret_cast<uint16_t*>(interpreter->tensor(0)->data.data);
659
+ std::generate_n(input_data, batch_size, std::ref(f16rng));
660
+
661
+ for (auto _ : state) {
662
+ if (interpreter->Invoke() != kTfLiteOk) {
663
+ state.SkipWithError("failed to invoke TFLite interpreter");
664
+ return;
665
+ }
666
+ }
667
+
668
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
669
+ if (cpu_frequency != 0) {
670
+ state.counters["cpufreq"] = cpu_frequency;
671
+ }
672
+
673
+ state.counters["elements"] =
674
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
675
+
676
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
677
+ state.counters["bytes"] =
678
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
679
+
680
+ interpreter.reset();
681
+ }
682
+
683
+ void tflite_convert_f32_qs8(benchmark::State& state) {
684
+ const size_t batch_size = state.range(0);
685
+
686
+ std::random_device random_device;
687
+ auto rng = std::mt19937(random_device());
688
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
689
+
690
+ flatbuffers::FlatBufferBuilder builder;
691
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
692
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
693
+
694
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
695
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
696
+ }};
697
+
698
+ const std::array<int32_t, 1> shape{{
699
+ static_cast<int32_t>(batch_size)
700
+ }};
701
+
702
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
703
+ tflite::CreateTensor(builder,
704
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
705
+ tflite::TensorType_FLOAT32),
706
+ tflite::CreateTensor(builder,
707
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
708
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
709
+ tflite::CreateQuantizationParameters(builder,
710
+ 0 /*min*/, 0 /*max*/,
711
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
712
+ builder.CreateVector<int64_t>({1 /* zero point */})))
713
+ }};
714
+
715
+ const std::array<int32_t, 1> op_inputs{{0}};
716
+ const std::array<int32_t, 1> op_outputs{{1}};
717
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
718
+ 0 /* opcode_index */,
719
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
720
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
721
+
722
+ const std::array<int32_t, 1> graph_inputs{{0}};
723
+ const std::array<int32_t, 1> graph_outputs{{1}};
724
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
725
+ builder,
726
+ builder.CreateVector(tensors.data(), tensors.size()),
727
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
728
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
729
+ builder.CreateVector(&op, 1));
730
+
731
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
732
+
733
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
734
+ TFLITE_SCHEMA_VERSION,
735
+ builder.CreateVector(&operator_code, 1),
736
+ builder.CreateVector(&subgraph, 1),
737
+ description,
738
+ builder.CreateVector(buffers.data(), buffers.size()));
739
+
740
+ builder.Finish(model_buffer);
741
+
742
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
743
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
744
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
745
+ std::unique_ptr<tflite::Interpreter> interpreter;
746
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
747
+ state.SkipWithError("failed to create TFLite interpreter");
748
+ return;
749
+ }
750
+ interpreter->SetNumThreads(1);
751
+
752
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
753
+ state.SkipWithError("failed to allocate tensors");
754
+ return;
755
+ }
756
+
757
+ std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));
758
+
759
+ for (auto _ : state) {
760
+ if (interpreter->Invoke() != kTfLiteOk) {
761
+ state.SkipWithError("failed to invoke TFLite interpreter");
762
+ return;
763
+ }
764
+ }
765
+
766
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
767
+ if (cpu_frequency != 0) {
768
+ state.counters["cpufreq"] = cpu_frequency;
769
+ }
770
+
771
+ state.counters["elements"] =
772
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
773
+
774
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
775
+ state.counters["bytes"] =
776
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
777
+
778
+ interpreter.reset();
779
+ }
780
+
781
+ void tflite_convert_f32_qu8(benchmark::State& state) {
782
+ const size_t batch_size = state.range(0);
783
+
784
+ std::random_device random_device;
785
+ auto rng = std::mt19937(random_device());
786
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
787
+
788
+ flatbuffers::FlatBufferBuilder builder;
789
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
790
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
791
+
792
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
793
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
794
+ }};
795
+
796
+ const std::array<int32_t, 1> shape{{
797
+ static_cast<int32_t>(batch_size)
798
+ }};
799
+
800
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
801
+ tflite::CreateTensor(builder,
802
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
803
+ tflite::TensorType_FLOAT32),
804
+ tflite::CreateTensor(builder,
805
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
806
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
807
+ tflite::CreateQuantizationParameters(builder,
808
+ 0 /*min*/, 0 /*max*/,
809
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
810
+ builder.CreateVector<int64_t>({127 /* zero point */})))
811
+ }};
812
+
813
+ const std::array<int32_t, 1> op_inputs{{0}};
814
+ const std::array<int32_t, 1> op_outputs{{1}};
815
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
816
+ 0 /* opcode_index */,
817
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
818
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
819
+
820
+ const std::array<int32_t, 1> graph_inputs{{0}};
821
+ const std::array<int32_t, 1> graph_outputs{{1}};
822
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
823
+ builder,
824
+ builder.CreateVector(tensors.data(), tensors.size()),
825
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
826
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
827
+ builder.CreateVector(&op, 1));
828
+
829
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
830
+
831
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
832
+ TFLITE_SCHEMA_VERSION,
833
+ builder.CreateVector(&operator_code, 1),
834
+ builder.CreateVector(&subgraph, 1),
835
+ description,
836
+ builder.CreateVector(buffers.data(), buffers.size()));
837
+
838
+ builder.Finish(model_buffer);
839
+
840
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
841
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
842
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
843
+ std::unique_ptr<tflite::Interpreter> interpreter;
844
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
845
+ state.SkipWithError("failed to create TFLite interpreter");
846
+ return;
847
+ }
848
+ interpreter->SetNumThreads(1);
849
+
850
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
851
+ state.SkipWithError("failed to allocate tensors");
852
+ return;
853
+ }
854
+
855
+ std::generate_n(interpreter->typed_tensor<float>(0), batch_size, std::ref(f32rng));
856
+
857
+ for (auto _ : state) {
858
+ if (interpreter->Invoke() != kTfLiteOk) {
859
+ state.SkipWithError("failed to invoke TFLite interpreter");
860
+ return;
861
+ }
862
+ }
863
+
864
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
865
+ if (cpu_frequency != 0) {
866
+ state.counters["cpufreq"] = cpu_frequency;
867
+ }
868
+
869
+ state.counters["elements"] =
870
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
871
+
872
+ const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
873
+ state.counters["bytes"] =
874
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
875
+
876
+ interpreter.reset();
877
+ }
878
+
879
+ void tflite_convert_qs8(benchmark::State& state) {
880
+ const size_t batch_size = state.range(0);
881
+
882
+ std::random_device random_device;
883
+ auto rng = std::mt19937(random_device());
884
+ auto i8rng = std::bind(
885
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
886
+ std::ref(rng));
887
+
888
+ flatbuffers::FlatBufferBuilder builder;
889
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
890
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
891
+
892
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
893
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
894
+ }};
895
+
896
+ const std::array<int32_t, 1> shape{{
897
+ static_cast<int32_t>(batch_size)
898
+ }};
899
+
900
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
901
+ tflite::CreateTensor(builder,
902
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
903
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
904
+ tflite::CreateQuantizationParameters(builder,
905
+ 0 /*min*/, 0 /*max*/,
906
+ builder.CreateVector<float>({0.75f /* scale */}),
907
+ builder.CreateVector<int64_t>({-1 /* zero point */}))),
908
+ tflite::CreateTensor(builder,
909
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
910
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
911
+ tflite::CreateQuantizationParameters(builder,
912
+ 0 /*min*/, 0 /*max*/,
913
+ builder.CreateVector<float>({0.5f /* scale */}),
914
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
915
+ }};
916
+
917
+ const std::array<int32_t, 1> op_inputs{{0}};
918
+ const std::array<int32_t, 1> op_outputs{{1}};
919
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
920
+ 0 /* opcode_index */,
921
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
922
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
923
+
924
+ const std::array<int32_t, 1> graph_inputs{{0}};
925
+ const std::array<int32_t, 1> graph_outputs{{1}};
926
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
927
+ builder,
928
+ builder.CreateVector(tensors.data(), tensors.size()),
929
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
930
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
931
+ builder.CreateVector(&op, 1));
932
+
933
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
934
+
935
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
936
+ TFLITE_SCHEMA_VERSION,
937
+ builder.CreateVector(&operator_code, 1),
938
+ builder.CreateVector(&subgraph, 1),
939
+ description,
940
+ builder.CreateVector(buffers.data(), buffers.size()));
941
+
942
+ builder.Finish(model_buffer);
943
+
944
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
945
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
946
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
947
+ std::unique_ptr<tflite::Interpreter> interpreter;
948
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
949
+ state.SkipWithError("failed to create TFLite interpreter");
950
+ return;
951
+ }
952
+ interpreter->SetNumThreads(1);
953
+
954
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
955
+ state.SkipWithError("failed to allocate tensors");
956
+ return;
957
+ }
958
+
959
+ std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(i8rng));
960
+
961
+ for (auto _ : state) {
962
+ if (interpreter->Invoke() != kTfLiteOk) {
963
+ state.SkipWithError("failed to invoke TFLite interpreter");
964
+ return;
965
+ }
966
+ }
967
+
968
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
969
+ if (cpu_frequency != 0) {
970
+ state.counters["cpufreq"] = cpu_frequency;
971
+ }
972
+
973
+ state.counters["elements"] =
974
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
975
+
976
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
977
+ state.counters["bytes"] =
978
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
979
+
980
+ interpreter.reset();
981
+ }
982
+
983
+ void tflite_convert_qs8_f32(benchmark::State& state) {
984
+ const size_t batch_size = state.range(0);
985
+
986
+ std::random_device random_device;
987
+ auto rng = std::mt19937(random_device());
988
+ auto i8rng = std::bind(
989
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
990
+ std::ref(rng));
991
+
992
+ flatbuffers::FlatBufferBuilder builder;
993
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
994
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
995
+
996
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
997
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
998
+ }};
999
+
1000
+ const std::array<int32_t, 1> shape{{
1001
+ static_cast<int32_t>(batch_size)
1002
+ }};
1003
+
1004
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1005
+ tflite::CreateTensor(builder,
1006
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1007
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
1008
+ tflite::CreateQuantizationParameters(builder,
1009
+ 0 /*min*/, 0 /*max*/,
1010
+ builder.CreateVector<float>({1.0f / 255.0f /* scale */}),
1011
+ builder.CreateVector<int64_t>({-128 /* zero point */}))),
1012
+ tflite::CreateTensor(builder,
1013
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1014
+ tflite::TensorType_FLOAT32)
1015
+ }};
1016
+
1017
+ const std::array<int32_t, 1> op_inputs{{0}};
1018
+ const std::array<int32_t, 1> op_outputs{{1}};
1019
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1020
+ 0 /* opcode_index */,
1021
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1022
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1023
+
1024
+ const std::array<int32_t, 1> graph_inputs{{0}};
1025
+ const std::array<int32_t, 1> graph_outputs{{1}};
1026
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1027
+ builder,
1028
+ builder.CreateVector(tensors.data(), tensors.size()),
1029
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1030
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1031
+ builder.CreateVector(&op, 1));
1032
+
1033
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
1034
+
1035
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1036
+ TFLITE_SCHEMA_VERSION,
1037
+ builder.CreateVector(&operator_code, 1),
1038
+ builder.CreateVector(&subgraph, 1),
1039
+ description,
1040
+ builder.CreateVector(buffers.data(), buffers.size()));
1041
+
1042
+ builder.Finish(model_buffer);
1043
+
1044
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1045
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1046
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1047
+ std::unique_ptr<tflite::Interpreter> interpreter;
1048
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1049
+ state.SkipWithError("failed to create TFLite interpreter");
1050
+ return;
1051
+ }
1052
+ interpreter->SetNumThreads(1);
1053
+
1054
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1055
+ state.SkipWithError("failed to allocate tensors");
1056
+ return;
1057
+ }
1058
+
1059
+ std::generate_n(interpreter->typed_tensor<int8_t>(0), batch_size, std::ref(i8rng));
1060
+
1061
+ for (auto _ : state) {
1062
+ if (interpreter->Invoke() != kTfLiteOk) {
1063
+ state.SkipWithError("failed to invoke TFLite interpreter");
1064
+ return;
1065
+ }
1066
+ }
1067
+
1068
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1069
+ if (cpu_frequency != 0) {
1070
+ state.counters["cpufreq"] = cpu_frequency;
1071
+ }
1072
+
1073
+ state.counters["elements"] =
1074
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1075
+
1076
+ const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
1077
+ state.counters["bytes"] =
1078
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1079
+
1080
+ interpreter.reset();
1081
+ }
1082
+
1083
+ void tflite_convert_qu8(benchmark::State& state) {
1084
+ const size_t batch_size = state.range(0);
1085
+
1086
+ std::random_device random_device;
1087
+ auto rng = std::mt19937(random_device());
1088
+ auto u8rng = std::bind(
1089
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
1090
+ std::ref(rng));
1091
+
1092
+ flatbuffers::FlatBufferBuilder builder;
1093
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
1094
+ CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);
1095
+
1096
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
1097
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
1098
+ }};
1099
+
1100
+ const std::array<int32_t, 1> shape{{
1101
+ static_cast<int32_t>(batch_size)
1102
+ }};
1103
+
1104
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1105
+ tflite::CreateTensor(builder,
1106
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1107
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1108
+ tflite::CreateQuantizationParameters(builder,
1109
+ 0 /*min*/, 0 /*max*/,
1110
+ builder.CreateVector<float>({0.75f /* scale */}),
1111
+ builder.CreateVector<int64_t>({125 /* zero point */}))),
1112
+ tflite::CreateTensor(builder,
1113
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1114
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1115
+ tflite::CreateQuantizationParameters(builder,
1116
+ 0 /*min*/, 0 /*max*/,
1117
+ builder.CreateVector<float>({0.5f /* scale */}),
1118
+ builder.CreateVector<int64_t>({130 /* zero point */})))
1119
+ }};
1120
+
1121
+ const std::array<int32_t, 1> op_inputs{{0}};
1122
+ const std::array<int32_t, 1> op_outputs{{1}};
1123
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1124
+ 0 /* opcode_index */,
1125
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1126
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1127
+
1128
+ const std::array<int32_t, 1> graph_inputs{{0}};
1129
+ const std::array<int32_t, 1> graph_outputs{{1}};
1130
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1131
+ builder,
1132
+ builder.CreateVector(tensors.data(), tensors.size()),
1133
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1134
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1135
+ builder.CreateVector(&op, 1));
1136
+
1137
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");
1138
+
1139
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1140
+ TFLITE_SCHEMA_VERSION,
1141
+ builder.CreateVector(&operator_code, 1),
1142
+ builder.CreateVector(&subgraph, 1),
1143
+ description,
1144
+ builder.CreateVector(buffers.data(), buffers.size()));
1145
+
1146
+ builder.Finish(model_buffer);
1147
+
1148
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1149
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1150
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1151
+ std::unique_ptr<tflite::Interpreter> interpreter;
1152
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1153
+ state.SkipWithError("failed to create TFLite interpreter");
1154
+ return;
1155
+ }
1156
+ interpreter->SetNumThreads(1);
1157
+
1158
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1159
+ state.SkipWithError("failed to allocate tensors");
1160
+ return;
1161
+ }
1162
+
1163
+ std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(u8rng));
1164
+
1165
+ for (auto _ : state) {
1166
+ if (interpreter->Invoke() != kTfLiteOk) {
1167
+ state.SkipWithError("failed to invoke TFLite interpreter");
1168
+ return;
1169
+ }
1170
+ }
1171
+
1172
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1173
+ if (cpu_frequency != 0) {
1174
+ state.counters["cpufreq"] = cpu_frequency;
1175
+ }
1176
+
1177
+ state.counters["elements"] =
1178
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1179
+
1180
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint8_t);
1181
+ state.counters["bytes"] =
1182
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1183
+
1184
+ interpreter.reset();
1185
+ }
1186
+
1187
+ void tflite_convert_qu8_f32(benchmark::State& state) {
1188
+ const size_t batch_size = state.range(0);
1189
+
1190
+ std::random_device random_device;
1191
+ auto rng = std::mt19937(random_device());
1192
+ auto u8rng = std::bind(
1193
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
1194
+ std::ref(rng));
1195
+
1196
+ flatbuffers::FlatBufferBuilder builder;
1197
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
1198
+ CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);
1199
+
1200
+ std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
1201
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
1202
+ }};
1203
+
1204
+ const std::array<int32_t, 1> shape{{
1205
+ static_cast<int32_t>(batch_size)
1206
+ }};
1207
+
1208
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
1209
+ tflite::CreateTensor(builder,
1210
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1211
+ tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
1212
+ tflite::CreateQuantizationParameters(builder,
1213
+ 0 /*min*/, 0 /*max*/,
1214
+ builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
1215
+ builder.CreateVector<int64_t>({128 /* zero point */}))),
1216
+ tflite::CreateTensor(builder,
1217
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
1218
+ tflite::TensorType_FLOAT32)
1219
+ }};
1220
+
1221
+ const std::array<int32_t, 1> op_inputs{{0}};
1222
+ const std::array<int32_t, 1> op_outputs{{1}};
1223
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
1224
+ 0 /* opcode_index */,
1225
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
1226
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
1227
+
1228
+ const std::array<int32_t, 1> graph_inputs{{0}};
1229
+ const std::array<int32_t, 1> graph_outputs{{1}};
1230
+ flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
1231
+ builder,
1232
+ builder.CreateVector(tensors.data(), tensors.size()),
1233
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
1234
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
1235
+ builder.CreateVector(&op, 1));
1236
+
1237
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");
1238
+
1239
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
1240
+ TFLITE_SCHEMA_VERSION,
1241
+ builder.CreateVector(&operator_code, 1),
1242
+ builder.CreateVector(&subgraph, 1),
1243
+ description,
1244
+ builder.CreateVector(buffers.data(), buffers.size()));
1245
+
1246
+ builder.Finish(model_buffer);
1247
+
1248
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
1249
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
1250
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
1251
+ std::unique_ptr<tflite::Interpreter> interpreter;
1252
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
1253
+ state.SkipWithError("failed to create TFLite interpreter");
1254
+ return;
1255
+ }
1256
+ interpreter->SetNumThreads(1);
1257
+
1258
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
1259
+ state.SkipWithError("failed to allocate tensors");
1260
+ return;
1261
+ }
1262
+
1263
+ std::generate_n(interpreter->typed_tensor<uint8_t>(0), batch_size, std::ref(u8rng));
1264
+
1265
+ for (auto _ : state) {
1266
+ if (interpreter->Invoke() != kTfLiteOk) {
1267
+ state.SkipWithError("failed to invoke TFLite interpreter");
1268
+ return;
1269
+ }
1270
+ }
1271
+
1272
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
1273
+ if (cpu_frequency != 0) {
1274
+ state.counters["cpufreq"] = cpu_frequency;
1275
+ }
1276
+
1277
+ state.counters["elements"] =
1278
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
1279
+
1280
+ const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
1281
+ state.counters["bytes"] =
1282
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
1283
+
1284
+ interpreter.reset();
1285
+ }
1286
+ #endif // BENCHMARK_TENSORFLOW_LITE
1287
+
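+ // Register each benchmark over a batch-size sweep sized for the input/output
+ // element types (benchmark::utils::UnaryElementwiseParameters from
+ // bench/utils.h); real time is measured because these conversions are
+ // memory-bound.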
1288
+ BENCHMARK(xnnpack_convert_f16_f32)
1289
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
1290
+ ->UseRealTime();
1291
+ BENCHMARK(xnnpack_convert_f32_f16)
1292
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint16_t>)
1293
+ ->UseRealTime();
1294
+ BENCHMARK(xnnpack_convert_f32_qs8)
1295
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
1296
+ ->UseRealTime();
1297
+ BENCHMARK(xnnpack_convert_f32_qu8)
1298
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
1299
+ ->UseRealTime();
1300
+ BENCHMARK(xnnpack_convert_qs8)
1301
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
1302
+ ->UseRealTime();
1303
+ BENCHMARK(xnnpack_convert_qs8_f32)
1304
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
1305
+ ->UseRealTime();
1306
+ BENCHMARK(xnnpack_convert_qu8)
1307
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
1308
+ ->UseRealTime();
1309
+ BENCHMARK(xnnpack_convert_qu8_f32)
1310
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
1311
+ ->UseRealTime();
1312
+
1313
+ #ifdef BENCHMARK_TENSORFLOW_LITE
1314
+ BENCHMARK(tflite_convert_f16_f32)
1315
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
1316
+ ->UseRealTime();
1317
+ BENCHMARK(tflite_convert_f32_qs8)
1318
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
1319
+ ->UseRealTime();
1320
+ BENCHMARK(tflite_convert_f32_qu8)
1321
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
1322
+ ->UseRealTime();
1323
+ BENCHMARK(tflite_convert_qs8)
1324
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
1325
+ ->UseRealTime();
1326
+ BENCHMARK(tflite_convert_qs8_f32)
1327
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
1328
+ ->UseRealTime();
1329
+ BENCHMARK(tflite_convert_qu8)
1330
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
1331
+ ->UseRealTime();
1332
+ BENCHMARK(tflite_convert_qu8_f32)
1333
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
1334
+ ->UseRealTime();
1335
+ #endif // BENCHMARK_TENSORFLOW_LITE
1336
+
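+ // Define main() unless this file is linked into a combined benchmark binary
+ // that provides one.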
1337
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
1338
+ BENCHMARK_MAIN();
1339
+ #endif
bench/convolution.cc ADDED
@@ -0,0 +1,1768 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <limits>
14
+ #include <memory>
15
+ #include <ostream>
16
+ #include <random>
17
+ #include <string>
18
+ #include <vector>
19
+
20
+ #include <xnnpack.h>
21
+
22
+ #include <benchmark/benchmark.h>
23
+ #include <fp16/fp16.h>
24
+ #ifdef BENCHMARK_TENSORFLOW_LITE
25
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
26
+ #include "tensorflow/lite/interpreter.h"
27
+ #include "tensorflow/lite/kernels/register.h"
28
+ #include "tensorflow/lite/model.h"
29
+ #include "tensorflow/lite/schema/schema_generated.h"
30
+ #include "tensorflow/lite/version.h"
31
+ #endif // BENCHMARK_TENSORFLOW_LITE
32
+ #include "bench/utils.h"
33
+
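+ // Benchmark arguments (state.range): batch size, input height/width,
+ // kernel height/width, padding height/width, subsampling (stride), dilation,
+ // groups, and per-group input/output channel counts.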
34
+ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
35
+ const size_t batch_size = state.range(0);
36
+ const size_t input_height = state.range(1);
37
+ const size_t input_width = state.range(2);
38
+ const size_t kernel_height = state.range(3);
39
+ const size_t kernel_width = state.range(4);
40
+ const size_t padding_height = state.range(5);
41
+ const size_t padding_width = state.range(6);
42
+ const size_t subsampling = state.range(7);
43
+ const size_t dilation = state.range(8);
44
+ const size_t groups = state.range(9);
45
+ const size_t group_input_channels = state.range(10);
46
+ const size_t group_output_channels = state.range(11);
47
+
48
+ std::random_device random_device;
49
+ auto rng = std::mt19937(random_device());
50
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
51
+ auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
52
+
53
+ const size_t output_pixel_stride = groups * group_output_channels;
54
+ const size_t input_pixel_stride = groups * group_input_channels;
55
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
56
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
57
+ const size_t padding_left = padding_width / 2;
58
+ const size_t padding_top = padding_height / 2;
59
+ const size_t padding_right = padding_width - padding_left;
60
+ const size_t padding_bottom = padding_height - padding_top;
61
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
62
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
63
+
64
+ std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
65
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
66
+ std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
67
+ std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
68
+ std::vector<int32_t> bias(groups * group_output_channels);
69
+ std::generate(bias.begin(), bias.end(), std::ref(i32rng));
70
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
71
+
72
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
73
+ if (status != xnn_status_success) {
74
+ state.SkipWithError("failed to initialize XNNPACK");
75
+ return;
76
+ }
77
+
78
+ const size_t num_buffers = 1 +
79
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
80
+ sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
81
+ std::vector<uint8_t> output(output_elements * num_buffers);
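+ // Buffer rotation: enough operator/output copies are allocated that the
+ // combined working set (weights + bias + one output) overflows the largest
+ // cache before any copy is reused, so each iteration runs cache-cold. E.g.
+ // with a 2 MiB cache and a 256 KiB per-copy footprint this yields
+ // num_buffers = 1 + ceil(2097152 / 262144) = 9.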
82
+
83
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
84
+ for (xnn_operator_t& convolution_op : convolution_operators) {
85
+ status = xnn_create_convolution2d_nhwc_qu8(
86
+ padding_top, padding_right, padding_bottom, padding_left,
87
+ kernel_height, kernel_width,
88
+ subsampling, subsampling,
89
+ dilation, dilation,
90
+ groups, group_input_channels, group_output_channels,
91
+ input_pixel_stride, output_pixel_stride,
92
+ 127, 0.5f,
93
+ 127, 0.5f,
94
+ kernel.data(), bias.data(),
95
+ 127, 0.5f, 0, 255,
96
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
97
+ if (status != xnn_status_success) {
98
+ state.SkipWithError("failed to create QUINT8 Convolution operator");
99
+ return;
100
+ }
101
+ }
102
+
103
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
104
+ status = xnn_reshape_convolution2d_nhwc_qu8(
105
+ convolution_operators[i],
106
+ batch_size, input_height, input_width,
107
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
108
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape QUINT8 Convolution operator");
+ return;
+ }
109
+ status = xnn_setup_convolution2d_nhwc_qu8(
110
+ convolution_operators[i],
111
+ input.data(), output.data() + i * output_elements);
112
+ if (status != xnn_status_success) {
113
+ state.SkipWithError("failed to setup QUINT8 Convolution operator");
114
+ return;
115
+ }
116
+ }
117
+
118
+ size_t buffer_index = 0;
119
+ for (auto _ : state) {
120
+ state.PauseTiming();
121
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
122
+ buffer_index = (buffer_index + 1) % num_buffers;
123
+ state.ResumeTiming();
124
+
125
+ status = xnn_run_operator(convolution_operators[buffer_index],
126
+ nullptr /* thread pool */);
127
+ if (status != xnn_status_success) {
128
+ state.SkipWithError("failed to run QUINT8 Convolution operator");
129
+ return;
130
+ }
131
+ }
132
+
133
+ for (xnn_operator_t& convolution_op : convolution_operators) {
134
+ status = xnn_delete_operator(convolution_op);
135
+ if (status != xnn_status_success) {
136
+ state.SkipWithError("failed to delete QUINT8 Convolution operator");
137
+ return;
138
+ }
139
+ convolution_op = nullptr;
140
+ }
141
+
142
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
143
+ if (cpu_frequency != 0) {
144
+ state.counters["cpufreq"] = cpu_frequency;
145
+ }
146
+
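+ // The OPS counter below counts one multiply and one add per
+ // multiply-accumulate: each of the N*Hout*Wout*G*GCout outputs accumulates
+ // KH*KW*GCin products, hence the factor of 2 in the formula.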
147
+ state.counters["OPS"] = benchmark::Counter(
148
+ uint64_t(state.iterations()) * 2 *
149
+ batch_size * output_height * output_width *
150
+ groups * group_input_channels * group_output_channels *
151
+ kernel_height * kernel_width,
152
+ benchmark::Counter::kIsRate);
153
+ }
154
+
155
+ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
156
+ const size_t batch_size = state.range(0);
157
+ const size_t input_height = state.range(1);
158
+ const size_t input_width = state.range(2);
159
+ const size_t kernel_height = state.range(3);
160
+ const size_t kernel_width = state.range(4);
161
+ const size_t padding_height = state.range(5);
162
+ const size_t padding_width = state.range(6);
163
+ const size_t subsampling = state.range(7);
164
+ const size_t dilation = state.range(8);
165
+ const size_t groups = state.range(9);
166
+ const size_t group_input_channels = state.range(10);
167
+ const size_t group_output_channels = state.range(11);
168
+
169
+ std::random_device random_device;
170
+ auto rng = std::mt19937(random_device());
171
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
172
+ auto i8rng = std::bind(
173
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
174
+
175
+ const size_t output_pixel_stride = groups * group_output_channels;
176
+ const size_t input_pixel_stride = groups * group_input_channels;
177
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
178
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
179
+ const size_t padding_left = padding_width / 2;
180
+ const size_t padding_top = padding_height / 2;
181
+ const size_t padding_right = padding_width - padding_left;
182
+ const size_t padding_bottom = padding_height - padding_top;
183
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
184
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
185
+
186
+ std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
187
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
188
+ std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
189
+ std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
190
+ std::vector<int32_t> bias(groups * group_output_channels);
191
+ std::generate(bias.begin(), bias.end(), std::ref(i32rng));
192
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
193
+
194
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
195
+ if (status != xnn_status_success) {
196
+ state.SkipWithError("failed to initialize XNNPACK");
197
+ return;
198
+ }
199
+
200
+ const size_t num_buffers = 1 +
201
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
202
+ sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
203
+ std::vector<int8_t> output(output_elements * num_buffers);
204
+
205
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
206
+ for (xnn_operator_t& convolution_op : convolution_operators) {
207
+ status = xnn_create_convolution2d_nhwc_qs8(
208
+ padding_top, padding_right, padding_bottom, padding_left,
209
+ kernel_height, kernel_width,
210
+ subsampling, subsampling,
211
+ dilation, dilation,
212
+ groups, group_input_channels, group_output_channels,
213
+ input_pixel_stride, output_pixel_stride,
214
+ 127, 0.5f, 0.5f,
215
+ kernel.data(), bias.data(),
216
+ 127, 0.5f, -128, 127,
217
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
218
+ if (status != xnn_status_success) {
219
+ state.SkipWithError("failed to create QINT8 Convolution operator");
220
+ return;
221
+ }
222
+ }
223
+
224
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
225
+ status = xnn_reshape_convolution2d_nhwc_qs8(
226
+ convolution_operators[i],
227
+ batch_size, input_height, input_width,
228
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
229
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape QINT8 Convolution operator");
+ return;
+ }
230
+ status = xnn_setup_convolution2d_nhwc_qs8(
231
+ convolution_operators[i],
232
+ input.data(), output.data() + i * output_elements);
233
+ if (status != xnn_status_success) {
234
+ state.SkipWithError("failed to setup QINT8 Convolution operator");
235
+ return;
236
+ }
237
+ }
238
+
239
+ size_t buffer_index = 0;
240
+ for (auto _ : state) {
241
+ state.PauseTiming();
242
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(int8_t));
243
+ buffer_index = (buffer_index + 1) % num_buffers;
244
+ state.ResumeTiming();
245
+
246
+ status = xnn_run_operator(convolution_operators[buffer_index],
247
+ nullptr /* thread pool */);
248
+ if (status != xnn_status_success) {
249
+ state.SkipWithError("failed to run QINT8 Convolution operator");
250
+ return;
251
+ }
252
+ }
253
+
254
+ for (xnn_operator_t& convolution_op : convolution_operators) {
255
+ status = xnn_delete_operator(convolution_op);
256
+ if (status != xnn_status_success) {
257
+ state.SkipWithError("failed to delete QINT8 Convolution operator");
258
+ return;
259
+ }
260
+ convolution_op = nullptr;
261
+ }
262
+
263
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
264
+ if (cpu_frequency != 0) {
265
+ state.counters["cpufreq"] = cpu_frequency;
266
+ }
267
+
268
+ state.counters["OPS"] = benchmark::Counter(
269
+ uint64_t(state.iterations()) * 2 *
270
+ batch_size * output_height * output_width *
271
+ groups * group_input_channels * group_output_channels *
272
+ kernel_height * kernel_width,
273
+ benchmark::Counter::kIsRate);
274
+ }
275
+
276
+ void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
277
+ const size_t batch_size = state.range(0);
278
+ const size_t input_height = state.range(1);
279
+ const size_t input_width = state.range(2);
280
+ const size_t kernel_height = state.range(3);
281
+ const size_t kernel_width = state.range(4);
282
+ const size_t padding_height = state.range(5);
283
+ const size_t padding_width = state.range(6);
284
+ const size_t subsampling = state.range(7);
285
+ const size_t dilation = state.range(8);
286
+ const size_t groups = state.range(9);
287
+ const size_t group_input_channels = state.range(10);
288
+ const size_t group_output_channels = state.range(11);
289
+
290
+ std::random_device random_device;
291
+ auto rng = std::mt19937(random_device());
292
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
293
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
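+ // Binding the f32rng functor itself (a nested bind expression) means it is
+ // re-invoked on every call, so f16rng produces a fresh half-precision value
+ // each time rather than converting a single cached float.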
294
+
295
+ const size_t output_pixel_stride = groups * group_output_channels;
296
+ const size_t input_pixel_stride = groups * group_input_channels;
297
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
298
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
299
+ const size_t padding_left = padding_width / 2;
300
+ const size_t padding_top = padding_height / 2;
301
+ const size_t padding_right = padding_width - padding_left;
302
+ const size_t padding_bottom = padding_height - padding_top;
303
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
304
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
305
+
306
+ std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
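+ // The XNN_EXTRA_BYTES slack in the allocation above lets vectorized
+ // micro-kernels read slightly past the last input element without faulting.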
307
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
308
+ std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
309
+ std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
310
+ std::vector<uint16_t> bias(groups * group_output_channels);
311
+ std::generate(bias.begin(), bias.end(), std::ref(f16rng));
312
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
313
+
314
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
315
+ if (status != xnn_status_success) {
316
+ state.SkipWithError("failed to initialize XNNPACK");
317
+ return;
318
+ }
319
+
320
+ const size_t num_buffers = 1 +
321
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
322
+ sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
323
+ std::vector<uint16_t> output(output_elements * num_buffers);
324
+
325
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
326
+ for (xnn_operator_t& convolution_op : convolution_operators) {
327
+ status = xnn_create_convolution2d_nhwc_f16(
328
+ padding_top, padding_right, padding_bottom, padding_left,
329
+ kernel_height, kernel_width,
330
+ subsampling, subsampling,
331
+ dilation, dilation,
332
+ groups, group_input_channels, group_output_channels,
333
+ input_pixel_stride, output_pixel_stride,
334
+ kernel.data(), bias.data(),
335
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
336
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
337
+ if (status != xnn_status_success) {
338
+ state.SkipWithError("failed to create FP16 Convolution operator");
339
+ return;
340
+ }
341
+ }
342
+
343
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
344
+ status = xnn_reshape_convolution2d_nhwc_f16(
345
+ convolution_operators[i],
346
+ batch_size, input_height, input_width,
347
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
348
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape FP16 Convolution operator");
+ return;
+ }
349
+ status = xnn_setup_convolution2d_nhwc_f16(
350
+ convolution_operators[i],
351
+ input.data(), output.data() + i * output_elements);
352
+ if (status != xnn_status_success) {
353
+ state.SkipWithError("failed to setup FP16 Convolution operator");
354
+ return;
355
+ }
356
+ }
357
+
358
+ size_t buffer_index = 0;
359
+ for (auto _ : state) {
360
+ state.PauseTiming();
361
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
362
+ buffer_index = (buffer_index + 1) % num_buffers;
363
+ state.ResumeTiming();
364
+
365
+ status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
366
+ if (status != xnn_status_success) {
367
+ state.SkipWithError("failed to run FP16 Convolution operator");
368
+ return;
369
+ }
370
+ }
371
+
372
+ for (xnn_operator_t& convolution_op : convolution_operators) {
373
+ status = xnn_delete_operator(convolution_op);
374
+ if (status != xnn_status_success) {
375
+ state.SkipWithError("failed to delete FP16 Convolution operator");
376
+ return;
377
+ }
378
+ convolution_op = nullptr;
379
+ }
380
+
381
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
382
+ if (cpu_frequency != 0) {
383
+ state.counters["cpufreq"] = cpu_frequency;
384
+ }
385
+
386
+ state.counters["FLOPS"] = benchmark::Counter(
387
+ uint64_t(state.iterations()) * 2 *
388
+ batch_size * output_height * output_width *
389
+ groups * group_input_channels * group_output_channels *
390
+ kernel_height * kernel_width,
391
+ benchmark::Counter::kIsRate);
392
+ }
393
+
394
+ void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
395
+ const size_t batch_size = state.range(0);
396
+ const size_t input_height = state.range(1);
397
+ const size_t input_width = state.range(2);
398
+ const size_t kernel_height = state.range(3);
399
+ const size_t kernel_width = state.range(4);
400
+ const size_t padding_height = state.range(5);
401
+ const size_t padding_width = state.range(6);
402
+ const size_t subsampling = state.range(7);
403
+ const size_t dilation = state.range(8);
404
+ const size_t groups = state.range(9);
405
+ const size_t group_input_channels = state.range(10);
406
+ const size_t group_output_channels = state.range(11);
407
+
408
+ std::random_device random_device;
409
+ auto rng = std::mt19937(random_device());
410
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
411
+
412
+ const size_t output_pixel_stride = groups * group_output_channels;
413
+ const size_t input_pixel_stride = groups * group_input_channels;
414
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
415
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
416
+ const size_t padding_left = padding_width / 2;
417
+ const size_t padding_top = padding_height / 2;
418
+ const size_t padding_right = padding_width - padding_left;
419
+ const size_t padding_bottom = padding_height - padding_top;
420
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
421
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
422
+
423
+ std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
424
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
425
+ std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
426
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
427
+ std::vector<float> bias(groups * group_output_channels);
428
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
429
+ const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
430
+
431
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
432
+ if (status != xnn_status_success) {
433
+ state.SkipWithError("failed to initialize XNNPACK");
434
+ return;
435
+ }
436
+
437
+ const size_t num_buffers = 1 +
438
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
439
+ sizeof(float) * (kernel.size() + bias.size() + output_elements));
440
+ std::vector<float> output(output_elements * num_buffers);
441
+
442
+ std::vector<xnn_operator_t> convolution_operators(num_buffers);
443
+ for (xnn_operator_t& convolution_op : convolution_operators) {
444
+ status = xnn_create_convolution2d_nhwc_f32(
445
+ padding_top, padding_right, padding_bottom, padding_left,
446
+ kernel_height, kernel_width,
447
+ subsampling, subsampling,
448
+ dilation, dilation,
449
+ groups, group_input_channels, group_output_channels,
450
+ input_pixel_stride, output_pixel_stride,
451
+ kernel.data(), bias.data(),
452
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
453
+ 0 /* flags */, nullptr, nullptr, &convolution_op);
454
+ if (status != xnn_status_success) {
455
+ state.SkipWithError("failed to create FP32 Convolution operator");
456
+ return;
457
+ }
458
+ }
459
+
460
+ for (size_t i = 0; i < convolution_operators.size(); i++) {
461
+ status = xnn_reshape_convolution2d_nhwc_f32(
462
+ convolution_operators[i],
463
+ batch_size, input_height, input_width,
464
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
465
+ nullptr /* thread pool */);
+ if (status != xnn_status_success) {
+ state.SkipWithError("failed to reshape FP32 Convolution operator");
+ return;
+ }
466
+ status = xnn_setup_convolution2d_nhwc_f32(
467
+ convolution_operators[i],
468
+ input.data(), output.data() + i * output_elements);
469
+ if (status != xnn_status_success) {
470
+ state.SkipWithError("failed to setup FP32 Convolution operator");
471
+ return;
472
+ }
473
+ }
474
+
475
+ size_t buffer_index = 0;
476
+ for (auto _ : state) {
477
+ state.PauseTiming();
478
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
479
+ buffer_index = (buffer_index + 1) % num_buffers;
480
+ state.ResumeTiming();
481
+
482
+ status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
483
+ if (status != xnn_status_success) {
484
+ state.SkipWithError("failed to run FP32 Convolution operator");
485
+ return;
486
+ }
487
+ }
488
+
489
+ for (xnn_operator_t& convolution_op : convolution_operators) {
490
+ status = xnn_delete_operator(convolution_op);
491
+ if (status != xnn_status_success) {
492
+ state.SkipWithError("failed to delete FP32 Convolution operator");
493
+ return;
494
+ }
495
+ convolution_op = nullptr;
496
+ }
497
+
498
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
499
+ if (cpu_frequency != 0) {
500
+ state.counters["cpufreq"] = cpu_frequency;
501
+ }
502
+
503
+ state.counters["FLOPS"] = benchmark::Counter(
504
+ uint64_t(state.iterations()) * 2 *
505
+ batch_size * output_height * output_width *
506
+ groups * group_input_channels * group_output_channels *
507
+ kernel_height * kernel_width,
508
+ benchmark::Counter::kIsRate);
509
+ }
510
+
511
+ #ifdef BENCHMARK_TENSORFLOW_LITE
512
+ void tflite_convolution_f32(benchmark::State& state, const char* net) {
513
+ const size_t batch_size = state.range(0);
514
+ const size_t input_height = state.range(1);
515
+ const size_t input_width = state.range(2);
516
+ const size_t kernel_height = state.range(3);
517
+ const size_t kernel_width = state.range(4);
518
+ const size_t padding_height = state.range(5);
519
+ const size_t padding_width = state.range(6);
520
+ const size_t subsampling = state.range(7);
521
+ const size_t dilation = state.range(8);
522
+ const size_t groups = state.range(9);
523
+ const size_t group_input_channels = state.range(10);
524
+ const size_t group_output_channels = state.range(11);
525
+
526
+ bool is_depthwise = false;
527
+ if (groups != 1) {
528
+ if (group_input_channels == 1) {
529
+ is_depthwise = true;
530
+ } else {
531
+ state.SkipWithError("grouped convolution is not supported");
532
+ return;
533
+ }
534
+ }
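+ // TFLite has no general grouped convolution: groups != 1 is representable
+ // only as DEPTHWISE_CONV_2D (one input channel per group); other groupings
+ // are skipped above.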
535
+
536
+ std::random_device random_device;
537
+ auto rng = std::mt19937(random_device());
538
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
539
+
540
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
541
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
542
+
543
+ tflite::Padding padding = tflite::Padding_VALID;
544
+ if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
545
+ padding = tflite::Padding_SAME;
546
+ } else if (padding_width == 0 && padding_height == 0) {
547
+ padding = tflite::Padding_VALID;
548
+ } else {
549
+ state.SkipWithError("unsupported padding");
550
+ return;
551
+ }
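+ // TFLite encodes padding only as the SAME/VALID enum, so the explicit
+ // padding amounts are mapped back: a total padding of (effective kernel
+ // size - 1) is treated as SAME, zero padding as VALID, and anything else
+ // cannot be expressed in the model.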
552
+
553
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
554
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
555
+
556
+ std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
557
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
558
+ std::vector<float> bias(groups * group_output_channels);
559
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
560
+
561
+ flatbuffers::FlatBufferBuilder builder;
562
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
563
+ CreateOperatorCode(
564
+ builder,
565
+ is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
566
+ 0);
567
+
568
+ flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
569
+ builder,
570
+ padding,
571
+ static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
572
+ tflite::ActivationFunctionType_NONE,
573
+ static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
574
+
575
+ flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
576
+ builder,
577
+ padding,
578
+ static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
579
+ static_cast<int32_t>(group_output_channels),
580
+ tflite::ActivationFunctionType_NONE,
581
+ static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
582
+
583
+ flatbuffers::Offset<tflite::Buffer> buffers[3] = {
584
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
585
+ tflite::CreateBuffer(builder, builder.CreateVector(
586
+ reinterpret_cast<const uint8_t*>(kernel.data()),
587
+ sizeof(float) * kernel.size())),
588
+ tflite::CreateBuffer(builder, builder.CreateVector(
589
+ reinterpret_cast<const uint8_t*>(bias.data()),
590
+ sizeof(float) * bias.size())),
591
+ };
592
+
593
+ const int32_t input_shape[4] = {
594
+ static_cast<int32_t>(batch_size),
595
+ static_cast<int32_t>(input_height),
596
+ static_cast<int32_t>(input_width),
597
+ static_cast<int32_t>(groups * group_input_channels)
598
+ };
599
+ const int32_t output_shape[4] = {
600
+ static_cast<int32_t>(batch_size),
601
+ static_cast<int32_t>(output_height),
602
+ static_cast<int32_t>(output_width),
603
+ static_cast<int32_t>(groups * group_output_channels)
604
+ };
605
+ const int32_t filter_shape[4] = {
606
+ static_cast<int32_t>(group_output_channels),
607
+ static_cast<int32_t>(kernel_height),
608
+ static_cast<int32_t>(kernel_width),
609
+ static_cast<int32_t>(groups * group_input_channels)
610
+ };
611
+ const int32_t bias_shape[1] = {
612
+ static_cast<int32_t>(groups * group_output_channels)
613
+ };
614
+
615
+ flatbuffers::Offset<tflite::Tensor> tensors[4] = {
616
+ tflite::CreateTensor(builder,
617
+ builder.CreateVector<int32_t>(input_shape, 4),
618
+ tflite::TensorType_FLOAT32,
619
+ 0 /* buffer id */,
620
+ builder.CreateString("input")),
621
+ tflite::CreateTensor(builder,
622
+ builder.CreateVector<int32_t>(filter_shape, 4),
623
+ tflite::TensorType_FLOAT32,
624
+ 1 /* buffer id */,
625
+ builder.CreateString("filter")),
626
+ tflite::CreateTensor(builder,
627
+ builder.CreateVector<int32_t>(bias_shape, 1),
628
+ tflite::TensorType_FLOAT32,
629
+ 2 /* buffer id */,
630
+ builder.CreateString("bias")),
631
+ tflite::CreateTensor(builder,
632
+ builder.CreateVector<int32_t>(output_shape, 4),
633
+ tflite::TensorType_FLOAT32,
634
+ 0 /* buffer id */,
635
+ builder.CreateString("output")),
636
+ };
637
+
638
+ const int32_t op_inputs[3] = { 0, 1, 2 };
639
+ const int32_t op_outputs[1] = { 3 };
640
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
641
+ builder,
642
+ 0 /* opcode_index */,
643
+ builder.CreateVector<int32_t>(op_inputs, 3),
644
+ builder.CreateVector<int32_t>(op_outputs, 1),
645
+ is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
646
+ is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
647
+ /*custom_options=*/0,
648
+ tflite::CustomOptionsFormat_FLEXBUFFERS);
649
+
650
+ const int32_t graph_inputs[1] = { 0 };
651
+ const int32_t graph_outputs[1] = { 3 };
652
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
653
+ builder,
654
+ builder.CreateVector(tensors, 4),
655
+ builder.CreateVector<int32_t>(graph_inputs, 1),
656
+ builder.CreateVector<int32_t>(graph_outputs, 1),
657
+ builder.CreateVector(&op, 1),
658
+ builder.CreateString("Conv2D subgraph"));
659
+
660
+ flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");
661
+
662
+ flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
663
+ TFLITE_SCHEMA_VERSION,
664
+ builder.CreateVector(&operator_code, 1),
665
+ builder.CreateVector(&subgraph, 1),
666
+ description,
667
+ builder.CreateVector(buffers, 3));
668
+
669
+ builder.Finish(model_buffer);
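+ // builder now holds a complete in-memory FlatBuffer for a single-op model
+ // (input -> Conv2D/DepthwiseConv2D -> output) that the interpreter below
+ // loads directly, with weights and bias embedded as constant buffers.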
670
+
671
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
672
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
673
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
674
+ std::unique_ptr<tflite::Interpreter> interpreter;
675
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
676
+ state.SkipWithError("failed to create TFLite interpreter");
677
+ return;
678
+ }
679
+ if (interpreter == nullptr) {
680
+ state.SkipWithError("TFLite interpreter is null");
681
+ return;
682
+ }
683
+ interpreter->SetNumThreads(1);
684
+
685
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
686
+ state.SkipWithError("failed to allocate tensors");
687
+ return;
688
+ }
689
+
690
+ std::generate(
691
+ interpreter->typed_tensor<float>(0),
692
+ interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
693
+ std::ref(f32rng));
694
+
695
+ for (auto _ : state) {
696
+ state.PauseTiming();
697
+ benchmark::utils::WipeCache();
698
+ benchmark::utils::PrefetchToL1(
699
+ interpreter->typed_tensor<float>(0),
700
+ batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
701
+ state.ResumeTiming();
702
+
703
+ if (interpreter->Invoke() != kTfLiteOk) {
704
+ state.SkipWithError("failed to invoke TFLite interpreter");
705
+ return;
706
+ }
707
+ }
708
+
709
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
710
+ if (cpu_frequency != 0) {
711
+ state.counters["cpufreq"] = cpu_frequency;
712
+ }
713
+
714
+ state.counters["FLOPS"] = benchmark::Counter(
715
+ uint64_t(state.iterations()) * 2 *
716
+ batch_size * output_height * output_width *
717
+ groups * group_input_channels * group_output_channels *
718
+ kernel_height * kernel_width,
719
+ benchmark::Counter::kIsRate);
720
+
721
+ interpreter.reset();
722
+ }
723
+ #endif // BENCHMARK_TENSORFLOW_LITE
724
+
725
+ // ShuffleNet v1 with 1 group.
726
+ static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
727
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
728
+
729
+ /*************************** Conv 1 **************************/
730
+ /* N H W KH KW PH PW S D G GCin GCout */
731
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
732
+ /******************* Stage 2: stride-2 unit ******************/
733
+ /* N H W KH KW PH PW S D G GCin GCout */
734
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 36});
735
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 36, 1, 1});
736
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 120});
737
+ /******************* Stage 2: stride-1 units *****************/
738
+ /* N H W KH KW PH PW S D G GCin GCout */
739
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 36});
740
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 36, 1, 1});
741
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 144});
742
+ /******************* Stage 3: stride-2 unit ******************/
743
+ /* N H W KH KW PH PW S D G GCin GCout */
744
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 72});
745
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 72, 1, 1});
746
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 144});
747
+ /******************* Stage 3: stride-1 units *****************/
748
+ /* N H W KH KW PH PW S D G GCin GCout */
749
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 72});
750
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 72, 1, 1});
751
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 288});
752
+ /******************* Stage 4: stride-2 unit ******************/
753
+ /* N H W KH KW PH PW S D G GCin GCout */
754
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 144});
755
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 144, 1, 1});
756
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 288});
757
+ /******************* Stage 4: stride-1 units *****************/
758
+ /* N H W KH KW PH PW S D G GCin GCout */
759
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 144});
760
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 144, 1, 1});
761
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 576});
762
+ }
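+ // Reading one row of the table above: {1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24}
+ // is a batch-1, 3x3, stride-2, single-group convolution over a 224x224x3
+ // input with total padding 2 per dimension, producing 24 channels at
+ // (224 + 2 - 3) / 2 + 1 = 112x112.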
763
+
764
+ // ShuffleNet v1 with 2 groups.
765
+ static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
766
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
767
+
768
+ /*************************** Conv 1 **************************/
769
+ /* N H W KH KW PH PW S D G GCin GCout */
770
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
771
+ /******************* Stage 2: stride-2 unit ******************/
772
+ /* N H W KH KW PH PW S D G GCin GCout */
773
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 50});
774
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 50, 1, 1});
775
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 88});
776
+ /******************* Stage 2: stride-1 units *****************/
777
+ /* N H W KH KW PH PW S D G GCin GCout */
778
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 25});
779
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 50, 1, 1});
780
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 100});
781
+ /******************* Stage 3: stride-2 unit ******************/
782
+ /* N H W KH KW PH PW S D G GCin GCout */
783
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 50});
784
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 100, 1, 1});
785
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 100});
786
+ /******************* Stage 3: stride-1 units *****************/
787
+ /* N H W KH KW PH PW S D G GCin GCout */
788
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 50});
789
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 100, 1, 1});
790
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 200});
791
+ /******************* Stage 4: stride-2 unit ******************/
792
+ /* N H W KH KW PH PW S D G GCin GCout */
793
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 100});
794
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 200, 1, 1});
795
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 200});
796
+ /******************* Stage 4: stride-1 units *****************/
797
+ /* N H W KH KW PH PW S D G GCin GCout */
798
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 400, 100});
799
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 200, 1, 1});
800
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 400});
801
+ }
802
+
803
+ // ShuffleNet v1 with 3 groups.
804
+ static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
805
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
806
+
807
+ /*************************** Conv 1 **************************/
808
+ /* N H W KH KW PH PW S D G GCin GCout */
809
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
810
+ /******************* Stage 2: stride-2 unit ******************/
811
+ /* N H W KH KW PH PW S D G GCin GCout */
812
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 60});
813
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 60, 1, 1});
814
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 72});
815
+ /******************* Stage 2: stride-1 units *****************/
816
+ /* N H W KH KW PH PW S D G GCin GCout */
817
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 20});
818
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 60, 1, 1});
819
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 80});
820
+ /******************* Stage 3: stride-2 unit ******************/
821
+ /* N H W KH KW PH PW S D G GCin GCout */
822
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 40});
823
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 120, 1, 1});
824
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 80});
825
+ /******************* Stage 3: stride-1 units *****************/
826
+ /* N H W KH KW PH PW S D G GCin GCout */
827
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 40});
828
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 120, 1, 1});
829
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 160});
830
+ /******************* Stage 4: stride-2 unit ******************/
831
+ /* N H W KH KW PH PW S D G GCin GCout */
832
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 80});
833
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 240, 1, 1});
834
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 160});
835
+ /******************* Stage 4: stride-1 units *****************/
836
+ /* N H W KH KW PH PW S D G GCin GCout */
837
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 320, 80});
838
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 240, 1, 1});
839
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 320});
840
+ }
841
+
842
+ // ShuffleNet v1 with 4 groups.
843
+ static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
844
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
845
+
846
+ /*************************** Conv 1 **************************/
847
+ /* N H W KH KW PH PW S D G GCin GCout */
848
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
849
+ /******************* Stage 2: stride-2 unit ******************/
850
+ /* N H W KH KW PH PW S D G GCin GCout */
851
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 68});
852
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 68, 1, 1});
853
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 62});
854
+ /******************* Stage 2: stride-1 units *****************/
855
+ /* N H W KH KW PH PW S D G GCin GCout */
856
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 17});
857
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 68, 1, 1});
858
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 68});
859
+ /******************* Stage 3: stride-2 unit ******************/
860
+ /* N H W KH KW PH PW S D G GCin GCout */
861
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 34});
862
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 136, 1, 1});
863
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 68});
864
+ /******************* Stage 3: stride-1 units *****************/
865
+ /* N H W KH KW PH PW S D G GCin GCout */
866
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 34});
867
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 136, 1, 1});
868
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 136});
869
+ /******************* Stage 4: stride-2 unit ******************/
870
+ /* N H W KH KW PH PW S D G GCin GCout */
871
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 68});
872
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 272, 1, 1});
873
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 136});
874
+ /******************* Stage 4: stride-1 units *****************/
875
+ /* N H W KH KW PH PW S D G GCin GCout */
876
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 272, 68});
877
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 272, 1, 1});
878
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 272});
879
+ }
880
+
881
+ // ShuffleNet v1 with 8 groups.
882
+ static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
883
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
884
+
885
+ /*************************** Conv 1 **************************/
886
+ /* N H W KH KW PH PW S D G GCin GCout */
887
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
888
+ /******************* Stage 2: stride-2 unit ******************/
889
+ /* N H W KH KW PH PW S D G GCin GCout */
890
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 96});
891
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 96, 1, 1});
892
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 45});
893
+ /******************* Stage 2: stride-1 units *****************/
894
+ /* N H W KH KW PH PW S D G GCin GCout */
895
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 12});
896
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 96, 1, 1});
897
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 48});
898
+ /******************* Stage 3: stride-2 unit ******************/
899
+ /* N H W KH KW PH PW S D G GCin GCout */
900
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 24});
901
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
902
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 48});
903
+ /******************* Stage 3: stride-1 units *****************/
904
+ /* N H W KH KW PH PW S D G GCin GCout */
905
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 24});
906
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 192, 1, 1});
907
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 96});
908
+ /******************* Stage 4: stride-2 unit ******************/
909
+ /* N H W KH KW PH PW S D G GCin GCout */
910
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 48});
911
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 384, 1, 1});
912
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 96});
913
+ /******************* Stage 4: stride-1 units *****************/
914
+ /* N H W KH KW PH PW S D G GCin GCout */
915
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 192, 48});
916
+ b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 384, 1, 1});
917
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 192});
918
+ }
919
+
920
+ // ShuffleNet v2 (0.5X scale)
921
+ static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
922
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
923
+
924
+ /*************************** Conv 1 **************************/
925
+ /* N H W KH KW PH PW S D G GCin GCout */
926
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
927
+ /************************** Stage 2 **************************/
928
+ /* N H W KH KW PH PW S D G GCin GCout */
929
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
930
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 24});
931
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 24});
932
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 24, 1, 1});
933
+ /************************** Stage 3 **************************/
934
+ /* N H W KH KW PH PW S D G GCin GCout */
935
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 48, 1, 1});
936
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 48});
937
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 48, 48});
938
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 48, 1, 1});
939
+ /************************** Stage 4 **************************/
940
+ /* N H W KH KW PH PW S D G GCin GCout */
941
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 96, 1, 1});
942
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 96});
943
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 96});
944
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 96, 1, 1});
945
+ /*************************** Conv 5 **************************/
946
+ /* N H W KH KW PH PW S D G GCin GCout */
947
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 192, 1024});
948
+ }
949
+
950
+ // ShuffleNet v2 (1.0X scale)
951
+ static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
952
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
953
+
954
+ /*************************** Conv 1 **************************/
955
+ /* N H W KH KW PH PW S D G GCin GCout */
956
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
957
+ /************************** Stage 2 **************************/
958
+ /* N H W KH KW PH PW S D G GCin GCout */
959
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
960
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 58});
961
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 58});
962
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 58, 1, 1});
963
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 58, 58});
964
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 58, 1, 1});
965
+ /************************** Stage 3 **************************/
966
+ /* N H W KH KW PH PW S D G GCin GCout */
967
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 116, 1, 1});
968
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 116, 116});
969
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 116, 116});
970
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 116, 1, 1});
971
+ /************************** Stage 4 **************************/
972
+ /* N H W KH KW PH PW S D G GCin GCout */
973
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 232, 1, 1});
974
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 232, 232});
975
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 232, 232});
976
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 232, 1, 1});
977
+ /*************************** Conv 5 **************************/
978
+ /* N H W KH KW PH PW S D G GCin GCout */
979
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 464, 1024});
980
+ }
981
+
982
+ // ShuffleNet v2 (1.5X scale)
983
+ static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
984
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
985
+
986
+ /*************************** Conv 1 **************************/
987
+ /* N H W KH KW PH PW S D G GCin GCout */
988
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
989
+ /************************** Stage 2 **************************/
990
+ /* N H W KH KW PH PW S D G GCin GCout */
991
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
992
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
993
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 88});
994
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 88, 1, 1});
995
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 88});
996
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
997
+ /************************** Stage 3 **************************/
998
+ /* N H W KH KW PH PW S D G GCin GCout */
999
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 176, 1, 1});
1000
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1001
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1002
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 176, 1, 1});
1003
+ /************************** Stage 4 **************************/
1004
+ /* N H W KH KW PH PW S D G GCin GCout */
1005
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 352, 1, 1});
1006
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1007
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1008
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 352, 1, 1});
1009
+ /*************************** Conv 5 **************************/
1010
+ /* N H W KH KW PH PW S D G GCin GCout */
1011
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 704, 1024});
1012
+ }
1013
+
1014
+ // ShuffleNet v2 (2.0X scale)
1015
+ static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
1016
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1017
+
1018
+ /*************************** Conv 1 **************************/
1019
+ /* N H W KH KW PH PW S D G GCin GCout */
1020
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1021
+ /************************** Stage 2 **************************/
1022
+ /* N H W KH KW PH PW S D G GCin GCout */
1023
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1024
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1025
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1026
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 122, 1, 1});
1027
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 122, 122});
1028
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 122, 1, 1});
1029
+ /************************** Stage 3 **************************/
1030
+ /* N H W KH KW PH PW S D G GCin GCout */
1031
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 244, 1, 1});
1032
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1033
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1034
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 244, 1, 1});
1035
+ /************************** Stage 4 **************************/
1036
+ /* N H W KH KW PH PW S D G GCin GCout */
1037
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 488, 1, 1});
1038
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1039
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1040
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 488, 1, 1});
1041
+ /*************************** Conv 5 **************************/
1042
+ /* N H W KH KW PH PW S D G GCin GCout */
1043
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 976, 2048});
1044
+ }
1045
+
1046
+ static void MobileNetV1(benchmark::internal::Benchmark* b) {
1047
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1048
+
1049
+ /* N H W KH KW PH PW S D G GCin GCout */
1050
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1051
+ b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1052
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 64});
1053
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
1054
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 128});
1055
+ b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 128, 1, 1});
1056
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 128, 128});
1057
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 128, 1, 1});
1058
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 256});
1059
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 256, 1, 1});
1060
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 256, 256});
1061
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 256, 1, 1});
1062
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 512});
1063
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 512, 1, 1});
1064
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
1065
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 512, 1, 1});
1066
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 1024});
1067
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1024, 1, 1});
1068
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 1024, 1024});
1069
+ }
1070
+
1071
+ static void MobileNetV2(benchmark::internal::Benchmark* b) {
1072
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1073
+
1074
+ /* N H W KH KW PH PW S D G GCin GCout */
1075
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1076
+
1077
+ /************************ Bottleneck 1 ***********************/
1078
+ /* N H W KH KW PH PW S D G GCin GCout */
1079
+ b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1080
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 16});
1081
+
1082
+ /************************ Bottleneck 2 ***********************/
1083
+ /* N H W KH KW PH PW S D G GCin GCout */
1084
+ b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 96});
1085
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1086
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 96, 24});
1087
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1088
+ b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 144, 1, 1});
1089
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 144, 24});
1090
+
1091
+ /************************ Bottleneck 3 ***********************/
1092
+ /* N H W KH KW PH PW S D G GCin GCout */
1093
+ //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1094
+ b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 144, 1, 1});
1095
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 32});
1096
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1097
+ b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1098
+ b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1099
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1100
+ //b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1101
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1102
+
1103
+ /************************ Bottleneck 4 ***********************/
1104
+ /* N H W KH KW PH PW S D G GCin GCout */
1105
+ //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1106
+ b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
1107
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 192, 64});
1108
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1109
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1110
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1111
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1112
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1113
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1114
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1115
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1116
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1117
+
1118
+ /************************ Bottleneck 5 ***********************/
1119
+ /* N H W KH KW PH PW S D G GCin GCout */
1120
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1121
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1122
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 96});
1123
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1124
+ b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1125
+ b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1126
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1127
+ //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1128
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1129
+
1130
+ /************************ Bottleneck 6 ***********************/
1131
+ /* N H W KH KW PH PW S D G GCin GCout */
1132
+ //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1133
+ b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 576, 1, 1});
1134
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 160});
1135
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1136
+ b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1137
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1138
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1139
+ //b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1140
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1141
+
1142
+ /************************ Bottleneck 7 ***********************/
1143
+ /* N H W KH KW PH PW S D G GCin GCout */
1144
+ //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1145
+ //b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1146
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 320});
1147
+
1148
+ /******************** Pre-pooling Conv2D *********************/
1149
+ /* N H W KH KW PH PW S D G GCin GCout */
1150
+ b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 320, 1280});
1151
+ /******************** Post-pooling Conv2D ********************/
1152
+ /* N H W KH KW PH PW S D G GCin GCout */
1153
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1000});
1154
+ }
1155
+
1156
+ static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
1157
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1158
+
1159
+ /*********************** Initial Stage ***********************/
1160
+ /* N H W KH KW PH PW S D G GCin GCout */
1161
+ b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
1162
+ /*********************** Bottleneck 1 ************************/
1163
+ /* N H W KH KW PH PW S D G GCin GCout */
1164
+ b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 16, 1, 1});
1165
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 16, 8});
1166
+ b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 8, 16});
1167
+ b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 16});
1168
+ /*********************** Bottleneck 2 ************************/
1169
+ /* N H W KH KW PH PW S D G GCin GCout */
1170
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 72});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 72, 1, 1});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   /*********************** Bottleneck 3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 24});
+   /*********************** Bottleneck 4 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 96});
+   b->Args({1, 28, 28, 5, 5, 4, 4, 2, 1, 96, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 96, 24});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 96});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 40});
+   /*********************** Bottleneck 5 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
+   /*********************** Bottleneck 6 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   //b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
+   /*********************** Bottleneck 7 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 120, 48});
+   /*********************** Bottleneck 8 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 144});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 144, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 40});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 40, 144});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 144, 48});
+   /*********************** Bottleneck 9 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 288});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 288, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 288, 72});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 288});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 288, 96});
+   /*********************** Bottleneck 10 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
+   /*********************** Bottleneck 11 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   //b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
+   /************************ Last Stage ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 1024});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1024, 1001});
+ }
+
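+ // Column legend for the Args rows in the model functions here (mirrors
+ // ArgNames): N = batch, H/W = input spatial size, KH/KW = kernel size,
+ // PH/PW = total padding, S = stride, D = dilation, G = groups, and
+ // GCin/GCout = channels per group. For example,
+ // {1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88} is an unpadded 1x1 pointwise
+ // convolution taking a 28x28x24 input to a 28x28x88 output, while rows with
+ // G > 1 and GCin = GCout = 1 are depthwise convolutions.
+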
+ static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /*********************** Initial Stage ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
+   /*********************** Bottleneck 1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 16, 1, 1});
+   b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 16});
+   /*********************** Bottleneck 2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 24});
+   /*********************** Bottleneck 3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 72, 1, 1});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   /*********************** Bottleneck 4 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 56, 56, 5, 5, 4, 4, 2, 1, 72, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 24});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 72});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 40});
+   /*********************** Bottleneck 5 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
+   /*********************** Bottleneck 6 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
+   //b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
+   /*********************** Bottleneck 7 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 240});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 240, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 80});
+   /*********************** Bottleneck 8 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 200});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 200, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 200, 80});
+   /*********************** Bottleneck 9 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
+   /********************** Bottleneck 10 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
+   //b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
+   /********************** Bottleneck 11 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 480});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 480, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 480, 120});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 480});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 480, 112});
+   /********************** Bottleneck 12 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 672, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 672, 168});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 168, 672});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 672, 112});
+   /********************** Bottleneck 13 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
+   b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 672, 1, 1});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 672, 160});
+   /********************** Bottleneck 14 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
+   /********************** Bottleneck 15 ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   //b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
+   //b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
+   /************************ Last Stage ***********************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 1280});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1001});
+ }
+
+ // SqueezeNet 1.0
+ static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************** Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 96});
+   /************************** Fire 2 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 96, 16});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 3 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
+   //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 4 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 32});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 5 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 6 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 7 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 48});
+   //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 8 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 64});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************** Fire 9 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************* Conv 10 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************** Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 64});
+   /************************** Fire 2 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 64, 16});
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 3 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
+   //b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
+   //b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
+   /************************** Fire 4 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 128, 32});
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 5 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
+   //b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
+   //b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
+   /************************** Fire 6 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 7 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 48});
+   //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
+   //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
+   /************************** Fire 8 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 64});
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************** Fire 9 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
+   //b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   //b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
+   /************************* Conv 10 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
+ }
+
+ static void InceptionV3(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 299, 299, 3, 3, 0, 0, 2, 1, 1, 3, 32});
+   b->Args({1, 149, 149, 3, 3, 0, 0, 1, 1, 1, 32, 32});
+   b->Args({1, 147, 147, 3, 3, 2, 2, 1, 1, 1, 32, 64});
+   b->Args({1, 73, 73, 1, 1, 0, 0, 1, 1, 1, 64, 80});
+   b->Args({1, 73, 73, 3, 3, 0, 0, 1, 1, 1, 80, 192});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 48});
+   b->Args({1, 35, 35, 5, 5, 4, 4, 1, 1, 1, 48, 64});
+   b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 64, 96});
+   b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 96, 96});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 32});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 48});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 64});
+   b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 48});
+   b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 288, 384});
+   b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 96, 96});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 192});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 128});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 128});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 128});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 192});
+   b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 160});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 160});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 160});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 192});
+   b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 192, 192});
+   b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 192, 192});
+   b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 320});
+   b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 192});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 320});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 384});
+   b->Args({1, 8, 8, 1, 3, 0, 2, 1, 1, 1, 384, 384});
+   b->Args({1, 8, 8, 3, 1, 2, 0, 1, 1, 1, 384, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 448});
+   b->Args({1, 8, 8, 3, 3, 2, 2, 1, 1, 1, 448, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 192});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 320});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 384});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 448});
+   b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 192});
+   b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2048, 1001});
+ }
+
+ static void ResNet18(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
+   /************************ Conv 2.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   /************************ Conv 3.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 64, 128});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 64, 128});
+   /************************ Conv 4.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 128, 256});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 128, 256});
+   /************************ Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 256, 512});
+   b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 256, 512});
+ }
+
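+ // In the ResNet argument sets, the trailing 1x1 row with S = 2 in each
+ // downsampling stage is the projection shortcut of the residual block;
+ // commented-out rows repeat shapes that are already benchmarked above them.
+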
+ static void ResNet50(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1 *************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
+   /************************ Conv 2.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 64});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   /************************ Conv 2.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 64});
+   //b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+   //b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
+   /************************ Conv 3.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 128});
+   b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 128, 128});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
+   b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 256, 512});
+   /************************ Conv 3.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 128});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+   //b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
+   /************************ Conv 4.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 256});
+   b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 256, 256});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
+   b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 512, 1024});
+   /************************ Conv 4.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 256});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   //b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
+   /************************ Conv 5.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 512});
+   b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 512, 512});
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
+   b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 1024, 2048});
+   /************************ Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 2048, 512});
+   b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   //b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
+ }
+
+ static void VGG(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /************************* Conv 1.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 3, 64});
+   /************************* Conv 1.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 64, 64});
+
+   /************************* Conv 2.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 64, 128});
+   /************************* Conv 2.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 128, 128});
+
+   /************************* Conv 3.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 128, 256});
+   /************************* Conv 3.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 256, 256});
+   /************************* Conv 3.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 256});
+
+   /************************* Conv 4.1 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 256, 512});
+   /************************* Conv 4.2 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   /************************* Conv 4.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 512});
+
+   /************************* Conv 5.X ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 512, 512});
+   /************************* Conv 5.3 ************************/
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
+ }
+
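+ // The SRCNN variants below are named after the kernel sizes of their three
+ // convolution layers (e.g. 9-1-5 = 9x9, then 1x1, then 5x5). Every layer is
+ // unpadded, so each KxK convolution shrinks the spatial size by K - 1:
+ // 384 -> 376 after a 9x9 layer, and 376 -> 372 after a 5x5 layer.
+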
+ // SRCNN (9-1-5)
+ static void SRCNN915(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 1, 1, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-3-5)
+ static void SRCNN935(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 3, 3, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 374, 374, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
+ // SRCNN (9-5-5)
+ static void SRCNN955(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
+
+   /* N H W KH KW PH PW S D G GCin GCout */
+   b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
+   b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 64, 32});
+   b->Args({1, 372, 372, 5, 5, 0, 0, 1, 1, 1, 32, 1});
+ }
+
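+ // Each BENCHMARK_CAPTURE(fn, name, label) below registers the wrapper fn as a
+ // Google Benchmark called "fn/name" and forwards the label string as fn's
+ // trailing `net` argument; Apply(Model) instantiates one benchmark per Args
+ // row defined above, and UseRealTime() reports wall-clock time rather than
+ // per-thread CPU time.
+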
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
+ BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-bfly4.cc ADDED
@@ -0,0 +1,116 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/fft.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ void cs16_bfly4(
+     benchmark::State& state,
+     xnn_cs16_bfly4_ukernel_fn bfly4,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t fft_size = state.range(0);
+   const size_t batch = state.range(1);
+   const size_t samples = state.range(2);
+   const size_t stride = state.range(3);
+
+   assert(fft_size == samples * stride * 4);  // 4 for bfly4.
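+   // A radix-4 butterfly combines 4 complex inputs, so one stage of an
+   // fft_size-point FFT performs fft_size / 4 == samples * stride butterflies,
+   // and each butterfly consumes 3 twiddle factors; that is why the twiddle
+   // buffer below holds fft_size * 3 / 4 complex values (2 int16 each).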
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> output(fft_size * 2);
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(fft_size * 3 / 4 * 2);
+
+   std::iota(output.begin(), output.end(), 0);
+   std::iota(twiddle.begin(), twiddle.end(), 0);
+
+   for (auto _ : state) {
+     bfly4(batch, samples * sizeof(int16_t) * 2, output.data(), twiddle.data(), stride * sizeof(int16_t) * 2);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
+
+ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 1, 64});
+   b->Args({256, 4, 1, 64});
+   b->Args({256, 1, 4, 16});
+   b->Args({256, 4, 4, 16});
+   b->Args({256, 1, 16, 4});
+   b->Args({256, 4, 16, 4});
+   b->Args({256, 1, 64, 1});
+ }
+
+ static void BenchmarkSamples1KernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 1, 64});
+   b->Args({256, 4, 1, 64});
+   b->Args({256, 16, 1, 64});
+   b->Args({256, 64, 1, 64});
+ }
+
+ static void BenchmarkSamples4KernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"fft_size", "batch", "samples", "stride"});
+   b->Args({256, 1, 4, 16});
+   b->Args({256, 4, 4, 16});
+   b->Args({256, 16, 4, 16});
+ }
+
+ #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x1, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x1)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x2, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x2)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__asm_aarch32_neon_x4, xnn_cs16_bfly4_samples1_ukernel__asm_aarch32_neon_x4)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__neon, xnn_cs16_bfly4_samples1_ukernel__neon)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples4__neon, xnn_cs16_bfly4_samples4_ukernel__neon)
+   ->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, neon_x1, xnn_cs16_bfly4_ukernel__neon_x1)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, neon_x4, xnn_cs16_bfly4_ukernel__neon_x4)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ BENCHMARK_CAPTURE(cs16_bfly4, samples1__scalar, xnn_cs16_bfly4_samples1_ukernel__scalar)
+   ->Apply(BenchmarkSamples1KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, samples4__scalar, xnn_cs16_bfly4_samples4_ukernel__scalar)
+   ->Apply(BenchmarkSamples4KernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x1, xnn_cs16_bfly4_ukernel__scalar_x1)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x2, xnn_cs16_bfly4_ukernel__scalar_x2)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_bfly4, scalar_x4, xnn_cs16_bfly4_ukernel__scalar_x4)
+   ->Apply(BenchmarkKernelSize)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-fftr.cc ADDED
@@ -0,0 +1,73 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/fft.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ void cs16_fftr(
+     benchmark::State& state,
+     xnn_cs16_fftr_ukernel_fn fftr,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t samples = state.range(0);
+
+   assert(samples % 2 == 0);
+   const size_t sample_size = samples * 2 + 2;
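+   // Judging by the buffer sizing (an inference, not a documented contract):
+   // the fftr post-processing of a real FFT writes samples + 1 complex bins
+   // (DC through Nyquist), and stored as interleaved (re, im) int16 pairs
+   // that is exactly the samples * 2 + 2 elements allocated below.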
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> data(sample_size + XNN_EXTRA_BYTES / sizeof(int16_t));
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> twiddle(samples);
+
+   std::iota(data.begin(), data.end(), 0);
+   std::iota(twiddle.begin(), twiddle.end(), 2);
+
+   for (auto _ : state) {
+     fftr(samples, data.data(), twiddle.data());
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
+
+ static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
+ {
+   b->ArgNames({"samples"});
+   b->Args({256});
+   b->Args({1024});
+ }
+
+ #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x1, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_aarch32_neon_x4, xnn_cs16_fftr_ukernel__asm_aarch32_neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_neon_x4, xnn_cs16_fftr_ukernel__neon_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x1, xnn_cs16_fftr_ukernel__scalar_x1)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x2, xnn_cs16_fftr_ukernel__scalar_x2)->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_fftr, cs16_scalar_x4, xnn_cs16_fftr_ukernel__scalar_x4)->Apply(BenchmarkKernelSize)->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/cs16-vsquareabs.cc ADDED
@@ -0,0 +1,127 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <complex>
+ #include <functional>
+ #include <numeric>
+ #include <vector>
+
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vsquareabs.h>
+
+
+ void cs16_vsquareabs(
+     benchmark::State& state,
+     xnn_cs16_vsquareabs_ukernel_fn vsquareabs,
+     benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t num_elements = state.range(0);
+
+   std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(
+     num_elements * 2 + XNN_EXTRA_BYTES / sizeof(int16_t));
+   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> output(num_elements);
+   std::iota(input.begin(), input.end(), 0);
+   std::iota(output.begin(), output.end(), 0);
+
+   for (auto _ : state) {
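+     // The first ukernel argument is the input size in bytes: each complex
+     // element is an interleaved (re, im) int16 pair (4 bytes), and the kernel
+     // writes one uint32 squared magnitude, re*re + im*im, per element.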
+     vsquareabs(num_elements * sizeof(int16_t) * 2, input.data(), output.data());
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = num_elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = num_elements * (sizeof(std::complex<int16_t>) + sizeof(uint32_t));
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x4,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x4,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x8,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x8,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x12,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x12,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_neon_x16,
+                   xnn_cs16_vsquareabs_ukernel__neon_mlal_ld128_x16,
+                   benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_HEXAGON
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x2,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x4,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x4)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x6,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x6)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x8,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x8)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x10,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x10)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_hexagon_x12,
+                   xnn_cs16_vsquareabs_ukernel__hexagon_x12)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_HEXAGON
+
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x1,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x1)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x2,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x3,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x3)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(cs16_vsquareabs, cs16_scalar_x4,
+                   xnn_cs16_vsquareabs_ukernel__scalar_x4)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<std::complex<int16_t>, uint32_t>)
+   ->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/dconv.h ADDED
@@ -0,0 +1,54 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+
+ #define BENCHMARK_DCONV(conv_fn) \
+   BENCHMARK_CAPTURE(conv_fn, mobilenet_v1, "MobileNet v1/v2")->Apply(MobileNetConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, mobilenet_v3, "MobileNet v3")->Apply(MobileNetV3ConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, shufflenet, "ShuffleNet v1/v2")->Apply(ShuffleNetConvArguments)->UseRealTime(); \
+   BENCHMARK_CAPTURE(conv_fn, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11ConvArguments)->UseRealTime();
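+ // The macro expands to one BENCHMARK_CAPTURE registration per model family
+ // defined below, so a direct-convolution benchmark wrapper needs only a single
+ // line, e.g. BENCHMARK_DCONV(f32_conv_hwc_3x3s2p1c3x4__neonfma_2x2); the
+ // kernel name in this example is illustrative rather than a guaranteed symbol.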
+
+
+ // ShuffleNet v1/v2.
+ static void ShuffleNetConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /********* Conv 1 ********/
+   /* H W GCout */
+   b->Args({224, 224, 24});
+ }
+
+ // MobileNet v1/v2.
+ static void MobileNetConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /* H W GCout */
+   b->Args({224, 224, 32});
+ }
+
+ // MobileNet v3 Small/Large.
+ static void MobileNetV3ConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "Cout"});
+
+   /******************* Initial Stage *******************/
+   /* H W GCout */
+   b->Args({224, 224, 16});
+ }
+
+ // SqueezeNet 1.1
+ static void SqueezeNetV11ConvArguments(benchmark::internal::Benchmark* b) {
+   b->ArgNames({"H", "W", "GCout"});
+
+   /*********************** Conv 1 **********************/
+   /* H W GCout */
+   b->Args({224, 224, 64});
+ }
bench/deconvolution.cc ADDED
@@ -0,0 +1,575 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <array>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <limits>
+ #include <memory>
+ #include <random>
+ #include <string>
+ #include <vector>
+
+ #include <xnnpack.h>
+
+ #include <benchmark/benchmark.h>
+ #ifdef BENCHMARK_TENSORFLOW_LITE
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
+ #include "tensorflow/lite/interpreter.h"
+ #include "tensorflow/lite/kernels/register.h"
+ #include "tensorflow/lite/model.h"
+ #include "tensorflow/lite/schema/schema_generated.h"
+ #include "tensorflow/lite/version.h"
+ #endif  // BENCHMARK_TENSORFLOW_LITE
+ #include "bench/utils.h"
29
+
30
+ void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
31
+ const size_t batch_size = state.range(0);
32
+ const size_t input_height = state.range(1);
33
+ const size_t input_width = state.range(2);
34
+ const size_t kernel_height = state.range(3);
35
+ const size_t kernel_width = state.range(4);
36
+ const size_t padding_height = state.range(5);
37
+ const size_t padding_width = state.range(6);
38
+ const size_t adjustment = state.range(7);
39
+ const size_t stride_height = state.range(8);
40
+ const size_t stride_width = state.range(9);
41
+ const size_t dilation = state.range(10);
42
+ const size_t input_channels = state.range(11);
43
+ const size_t output_channels = state.range(12);
44
+
45
+ std::random_device random_device;
46
+ auto rng = std::mt19937(random_device());
47
+ auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
48
+ auto u8rng = std::bind(
49
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
50
+ std::ref(rng));
51
+
52
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
53
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
54
+ const size_t padding_left = padding_width / 2;
55
+ const size_t padding_top = padding_height / 2;
56
+ const size_t padding_right = padding_width - padding_left;
57
+ const size_t padding_bottom = padding_height - padding_top;
58
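+   // Transposed-convolution output size: every input pixel scatters a
+   // stride-spaced effective kernel into the output, so
+   // out = stride * (in - 1) + adjustment + effective_kernel - padding,
+   // with std::max keeping the subtraction from underflowing. For example,
+   // in = 7, stride = 2, kernel = 3, dilation = 1, padding = 2, adjustment = 1
+   // gives 2 * 6 + 1 + 3 - 2 = 14.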
+   const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
+   const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;
+
+   std::vector<uint8_t> input(batch_size * input_height * input_width * input_channels);
+   std::generate(input.begin(), input.end(), std::ref(u8rng));
+   std::vector<uint8_t> kernel(output_channels * kernel_height * kernel_width * input_channels);
+   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
+   std::vector<int32_t> bias(output_channels);
+   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
+   const size_t output_elements = batch_size * output_height * output_width * output_channels;
+
+   xnn_status status = xnn_initialize(nullptr /* allocator */);
+   if (status != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(float) * (kernel.size() + bias.size() + output_elements));
+ std::vector<uint8_t> output(output_elements * num_buffers);
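+ // Rotating through num_buffers operator/output pairs makes each timed
+ // iteration touch memory that is no longer cache-resident: enough copies are
+ // allocated to exceed the largest cache, so the benchmark measures realistic
+ // memory traffic rather than a fully cached workload.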
79
+
80
+ std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
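+ // The scalar constants in the call below are quantization parameters:
+ // (zero point, scale) pairs for the input, kernel, and output, followed by
+ // the [0, 255] output clamping range. The specific values (zero point 127,
+ // scale 0.5) are arbitrary but valid choices for a throughput benchmark.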
81
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
82
+ status = xnn_create_deconvolution2d_nhwc_qu8(
83
+ padding_top, padding_right, padding_bottom, padding_left,
84
+ kernel_height, kernel_width,
85
+ stride_height, stride_width,
86
+ dilation, dilation,
87
+ /*groups=*/1, input_channels, output_channels,
88
+ /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
89
+ 127, 0.5f, 127, 0.5f,
90
+ kernel.data(), bias.data(),
91
+ 127, 0.5f, 0, 255,
92
+ 0 /* flags */,
93
+ nullptr, nullptr,
94
+ &deconvolution_op);
95
+ if (status != xnn_status_success) {
96
+ state.SkipWithError("failed to create QINT8 Deconvolution operator");
97
+ return;
98
+ }
99
+ }
100
+
101
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
102
+ status = xnn_reshape_deconvolution2d_nhwc_qu8(
103
+ deconvolution_operators[i],
104
+ batch_size, input_height, input_width,
105
+ 0 /* height adjustment */, 0 /* width adjustment */,
106
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
107
+ /*threadpool=*/nullptr);
108
+ if (status != xnn_status_success) {
109
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
110
+ return;
111
+ }
112
+ }
113
+
114
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
115
+ status = xnn_setup_deconvolution2d_nhwc_qu8(
116
+ deconvolution_operators[i],
117
+ input.data(), output.data() + i * output_elements);
118
+ if (status != xnn_status_success) {
119
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
120
+ return;
121
+ }
122
+ }
123
+
124
+ size_t buffer_index = 0;
125
+ for (auto _ : state) {
126
+ state.PauseTiming();
127
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
128
+ buffer_index = (buffer_index + 1) % num_buffers;
129
+ state.ResumeTiming();
130
+
131
+ status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
132
+ if (status != xnn_status_success) {
133
+ state.SkipWithError("failed to run QINT8 Deconvolution operator");
134
+ return;
135
+ }
136
+ }
137
+
138
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
139
+ status = xnn_delete_operator(deconvolution_op);
140
+ if (status != xnn_status_success) {
141
+ state.SkipWithError("failed to delete QINT8 Deconvolution operator");
142
+ return;
143
+ }
144
+ deconvolution_op = nullptr;
145
+ }
146
+
147
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
148
+ if (cpu_frequency != 0) {
149
+ state.counters["cpufreq"] = cpu_frequency;
150
+ }
151
+
152
+ state.counters["OPS"] = benchmark::Counter(
153
+ uint64_t(state.iterations()) * 2 *
154
+ batch_size * input_height * input_width *
155
+ input_channels * output_channels *
156
+ kernel_height * kernel_width,
157
+ benchmark::Counter::kIsRate);
158
+ }
159
+
160
+ void xnnpack_deconvolution_f32(benchmark::State& state, const char* net) {
161
+ const size_t batch_size = state.range(0);
162
+ const size_t input_height = state.range(1);
163
+ const size_t input_width = state.range(2);
164
+ const size_t kernel_height = state.range(3);
165
+ const size_t kernel_width = state.range(4);
166
+ const size_t padding_height = state.range(5);
167
+ const size_t padding_width = state.range(6);
168
+ const size_t adjustment = state.range(7);
169
+ const size_t stride_height = state.range(8);
170
+ const size_t stride_width = state.range(9);
171
+ const size_t dilation = state.range(10);
172
+ const size_t input_channels = state.range(11);
173
+ const size_t output_channels = state.range(12);
174
+
175
+ std::random_device random_device;
176
+ auto rng = std::mt19937(random_device());
177
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
178
+
179
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
180
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
181
+ const size_t padding_left = padding_width / 2;
182
+ const size_t padding_top = padding_height / 2;
183
+ const size_t padding_right = padding_width - padding_left;
184
+ const size_t padding_bottom = padding_height - padding_top;
185
+ const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + effective_kernel_height, padding_height) - padding_height;
186
+ const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;
187
+
188
+ std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
189
+ batch_size * input_height * input_width * input_channels);
190
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
191
+ std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
192
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
193
+ std::vector<float> bias(output_channels);
194
+ std::generate(bias.begin(), bias.end(), std::ref(f32rng));
195
+ const size_t output_elements = batch_size * output_height * output_width * output_channels;
196
+
197
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
198
+ if (status != xnn_status_success) {
199
+ state.SkipWithError("failed to initialize XNNPACK");
200
+ return;
201
+ }
202
+
203
+ const size_t num_buffers = 1 +
204
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
205
+ sizeof(float) * (kernel.size() + bias.size() + output_elements));
206
+ std::vector<float> output(output_elements * num_buffers);
207
+
208
+ std::vector<xnn_operator_t> deconvolution_operators(num_buffers);
209
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
210
+ status = xnn_create_deconvolution2d_nhwc_f32(
211
+ padding_top, padding_right, padding_bottom, padding_left,
212
+ kernel_height, kernel_width,
213
+ stride_height, stride_width,
214
+ dilation, dilation,
215
+ /*groups=*/1, input_channels, output_channels,
216
+ /*input_pixel_stride=*/input_channels, /*output_pixel_stride=*/output_channels,
217
+ kernel.data(), bias.data(),
218
+ -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
219
+ 0 /* flags */,
220
+ nullptr,
221
+ nullptr,
222
+ &deconvolution_op);
223
+ if (status != xnn_status_success) {
224
+ state.SkipWithError("failed to create FP32 Deconvolution operator");
225
+ return;
226
+ }
227
+ }
228
+
229
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
230
+ status = xnn_reshape_deconvolution2d_nhwc_f32(
231
+ deconvolution_operators[i],
232
+ batch_size, input_height, input_width,
233
+ 0 /* height adjustment */, 0 /* width adjustment */,
234
+ /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
235
+ /*threadpool=*/nullptr);
236
+ if (status != xnn_status_success) {
237
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
238
+ return;
239
+ }
240
+ }
241
+
242
+ for (size_t i = 0; i < deconvolution_operators.size(); i++) {
243
+ status = xnn_setup_deconvolution2d_nhwc_f32(
244
+ deconvolution_operators[i],
245
+ input.data(), output.data() + i * output_elements);
246
+ if (status != xnn_status_success) {
247
+ state.SkipWithError("failed to setup QINT8 Deconvolution operator");
248
+ return;
249
+ }
250
+ }
251
+
252
+ size_t buffer_index = 0;
253
+ for (auto _ : state) {
254
+ state.PauseTiming();
255
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
256
+ buffer_index = (buffer_index + 1) % num_buffers;
257
+ state.ResumeTiming();
258
+
259
+ status = xnn_run_operator(deconvolution_operators[buffer_index], nullptr /* thread pool */);
260
+ if (status != xnn_status_success) {
261
+ state.SkipWithError("failed to run FP32 Deconvolution operator");
262
+ return;
263
+ }
264
+ }
265
+
266
+ for (xnn_operator_t& deconvolution_op : deconvolution_operators) {
267
+ status = xnn_delete_operator(deconvolution_op);
268
+ if (status != xnn_status_success) {
269
+ state.SkipWithError("failed to delete FP32 Deconvolution operator");
270
+ return;
271
+ }
272
+ deconvolution_op = nullptr;
273
+ }
274
+
275
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
276
+ if (cpu_frequency != 0) {
277
+ state.counters["cpufreq"] = cpu_frequency;
278
+ }
279
+
280
+ state.counters["FLOPS"] = benchmark::Counter(
281
+ uint64_t(state.iterations()) * 2 *
282
+ batch_size * input_height * input_width *
283
+ input_channels * output_channels *
284
+ kernel_height * kernel_width,
285
+ benchmark::Counter::kIsRate);
286
+ }
287
+
288
+ #ifdef BENCHMARK_TENSORFLOW_LITE
289
+ void tflite_deconvolution_f32(benchmark::State& state, const char* net) {
290
+ const size_t batch_size = state.range(0);
291
+ const size_t input_height = state.range(1);
292
+ const size_t input_width = state.range(2);
293
+ const size_t kernel_height = state.range(3);
294
+ const size_t kernel_width = state.range(4);
295
+ const size_t padding_height = state.range(5);
296
+ const size_t padding_width = state.range(6);
297
+ const size_t adjustment = state.range(7);
298
+ const size_t stride_height = state.range(8);
299
+ const size_t stride_width = state.range(9);
300
+ const size_t dilation = state.range(10);
301
+ const size_t input_channels = state.range(11);
302
+ const size_t output_channels = state.range(12);
303
+
304
+ if (dilation != 1) {
305
+ state.SkipWithError("dilated deconvolution is not supported");
306
+ return;
307
+ }
308
+
309
+ std::random_device random_device;
310
+ auto rng = std::mt19937(random_device());
311
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
312
+
313
+ tflite::Padding tf_padding = tflite::Padding_VALID;
314
+ if (padding_width == kernel_width - stride_width && padding_height == kernel_height - stride_height) {
315
+ tf_padding = tflite::Padding_SAME;
316
+ } else if (padding_width == 0 && padding_height == 0) {
317
+ tf_padding = tflite::Padding_VALID;
318
+ } else {
319
+ state.SkipWithError("unsupported padding");
320
+ return;
321
+ }
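+ // TFLite's TransposeConv expresses padding only as SAME or VALID, so the
+ // explicit padding amounts are mapped back: padding == kernel - stride
+ // reproduces SAME, zero padding reproduces VALID, and any other combination
+ // has no TFLite equivalent and is skipped.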
322
+
323
+ const size_t output_height = std::max(stride_height * (input_height - 1) + adjustment + kernel_height, padding_height) - padding_height;
324
+ const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + kernel_width, padding_width) - padding_width;
325
+
326
+ std::vector<float> kernel(output_channels * kernel_height * kernel_width * input_channels);
327
+ std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
328
+
329
+ flatbuffers::FlatBufferBuilder builder;
330
+ flatbuffers::Offset<tflite::OperatorCode> operator_code =
331
+ CreateOperatorCode(builder, tflite::BuiltinOperator_TRANSPOSE_CONV, 0);
332
+
333
+ flatbuffers::Offset<tflite::TransposeConvOptions> transpose_conv_options = CreateTransposeConvOptions(
334
+ builder,
335
+ tf_padding,
336
+ static_cast<int32_t>(stride_width), static_cast<int32_t>(stride_height));
337
+
338
+ const std::array<int32_t, 4> input_shape{{
339
+ static_cast<int32_t>(batch_size),
340
+ static_cast<int32_t>(input_height),
341
+ static_cast<int32_t>(input_width),
342
+ static_cast<int32_t>(input_channels)
343
+ }};
344
+ const std::array<int32_t, 4> output_shape{{
345
+ static_cast<int32_t>(batch_size),
346
+ static_cast<int32_t>(output_height),
347
+ static_cast<int32_t>(output_width),
348
+ static_cast<int32_t>(output_channels)
349
+ }};
350
+ const std::array<int32_t, 4> filter_shape{{
351
+ static_cast<int32_t>(output_channels),
352
+ static_cast<int32_t>(kernel_height),
353
+ static_cast<int32_t>(kernel_width),
354
+ static_cast<int32_t>(input_channels)
355
+ }};
356
+ const std::array<int32_t, 1> output_shape_shape{{ 4 }};
357
+
358
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 3> buffers{{
359
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
360
+ tflite::CreateBuffer(builder, builder.CreateVector(
361
+ reinterpret_cast<const uint8_t*>(kernel.data()),
362
+ sizeof(float) * kernel.size())),
363
+ tflite::CreateBuffer(builder, builder.CreateVector(
364
+ reinterpret_cast<const uint8_t*>(output_shape.data()),
365
+ sizeof(int32_t) * output_shape.size())),
366
+ }};
367
+
368
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 4> tensors{{
369
+ tflite::CreateTensor(builder,
370
+ builder.CreateVector<int32_t>(output_shape_shape.data(), output_shape_shape.size()),
371
+ tflite::TensorType_INT32,
372
+ 2 /* buffer id */),
373
+ tflite::CreateTensor(builder,
374
+ builder.CreateVector<int32_t>(filter_shape.data(), filter_shape.size()),
375
+ tflite::TensorType_FLOAT32,
376
+ 1 /* buffer id */),
377
+ tflite::CreateTensor(builder,
378
+ builder.CreateVector<int32_t>(input_shape.data(), input_shape.size()),
379
+ tflite::TensorType_FLOAT32),
380
+ tflite::CreateTensor(builder,
381
+ builder.CreateVector<int32_t>(output_shape.data(), output_shape.size()),
382
+ tflite::TensorType_FLOAT32),
383
+ }};
384
+
385
+ const std::array<int32_t, 3> op_inputs{{ 0, 1, 2 }};
386
+ const std::array<int32_t, 1> op_outputs{{ 3 }};
387
+ flatbuffers::Offset<tflite::Operator> op = CreateOperator(
388
+ builder,
389
+ 0 /* opcode_index */,
390
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
391
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
392
+ tflite::BuiltinOptions_TransposeConvOptions,
393
+ transpose_conv_options.Union());
394
+
395
+ const std::array<int32_t, 1> graph_inputs{{ 2 }};
396
+ const std::array<int32_t, 1> graph_outputs{{ 3 }};
397
+ flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
398
+ builder,
399
+ builder.CreateVector(tensors.data(), tensors.size()),
400
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
401
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
402
+ builder.CreateVector(&op, 1),
403
+ builder.CreateString("TransposeConv subgraph"));
404
+
405
+ const flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("TransposeConv model");
406
+
407
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
408
+ TFLITE_SCHEMA_VERSION,
409
+ builder.CreateVector(&operator_code, 1),
410
+ builder.CreateVector(&subgraph, 1),
411
+ description,
412
+ builder.CreateVector(buffers.data(), buffers.size()));
413
+
414
+ builder.Finish(model_buffer);
415
+
416
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
417
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
418
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
419
+ std::unique_ptr<tflite::Interpreter> interpreter;
420
+ if (interpreterBuilder(&interpreter) != kTfLiteOk) {
421
+ state.SkipWithError("failed to create TFLite interpreter");
422
+ return;
423
+ }
424
+ if (interpreter == nullptr) {
425
+ state.SkipWithError("TFLite interpreter is null");
426
+ return;
427
+ }
428
+ interpreter->SetNumThreads(1);
429
+
430
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
431
+ state.SkipWithError("failed to allocate tensors");
432
+ return;
433
+ }
434
+
435
+ std::generate(
436
+ interpreter->typed_tensor<float>(2),
437
+ interpreter->typed_tensor<float>(2) + batch_size * input_channels * input_height * input_width,
438
+ std::ref(f32rng));
439
+
440
+ for (auto _ : state) {
441
+ state.PauseTiming();
442
+ benchmark::utils::WipeCache();
443
+ benchmark::utils::PrefetchToL1(
444
+ interpreter->typed_tensor<float>(2),
445
+ batch_size * input_channels * input_height * input_width * sizeof(float));
446
+ state.ResumeTiming();
447
+
448
+ if (interpreter->Invoke() != kTfLiteOk) {
449
+ state.SkipWithError("failed to invoke TFLite interpreter");
450
+ return;
451
+ }
452
+ }
453
+
454
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
455
+ if (cpu_frequency != 0) {
456
+ state.counters["cpufreq"] = cpu_frequency;
457
+ }
458
+
459
+ state.counters["FLOPS"] = benchmark::Counter(
460
+ uint64_t(state.iterations()) * 2 *
461
+ batch_size * input_height * input_width *
462
+ input_channels * output_channels *
463
+ kernel_height * kernel_width,
464
+ benchmark::Counter::kIsRate);
465
+
466
+ interpreter.reset();
467
+ }
468
+ #endif // BENCHMARK_TENSORFLOW_LITE
469
+
470
+ // FCN-32 model (PASCAL VOC version).
471
+ // We assume CIF image (352x288) on model input / output.
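+ // At the FCN-32 head the feature map has been downsampled 32x, so the
+ // 352x288 CIF frame becomes roughly 11x9 (352 / 32 = 11, 288 / 32 = 9),
+ // matching the H and W arguments below.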
472
+ static void FCN32(benchmark::internal::Benchmark* b) {
473
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
474
+
475
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
476
+ b->Args({1, 9, 11, 64, 64, 0, 0, 0, 32, 32, 1, 21, 21});
477
+ }
478
+
479
+ // FCN-16 model (PASCAL VOC version).
480
+ // We assume CIF image (352x288) on model input / output.
481
+ static void FCN16(benchmark::internal::Benchmark* b) {
482
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
483
+
484
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
485
+ b->Args({1, 9, 11, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
486
+ b->Args({1, 18, 22, 32, 32, 0, 0, 0, 16, 16, 1, 21, 21});
487
+ }
488
+
489
+ // FCN-8 model (PASCAL VOC version).
490
+ // We assume CIF image (352x288) on model input / output.
491
+ static void FCN8(benchmark::internal::Benchmark* b) {
492
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
493
+
494
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
495
+ b->Args({1, 9, 11, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
496
+ b->Args({1, 18, 22, 4, 4, 0, 0, 0, 2, 2, 1, 21, 21});
497
+ b->Args({1, 36, 44, 16, 16, 0, 0, 0, 8, 8, 1, 21, 21});
498
+ }
499
+
500
+ static void ENet(benchmark::internal::Benchmark* b) {
501
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
502
+
503
+ /*********************** Bottleneck 4.0 ***********************/
504
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
505
+ b->Args({1, 64, 64, 3, 3, 2, 2, 1, 2, 2, 1, 32, 32});
506
+ /*********************** Bottleneck 5.0 ***********************/
507
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
508
+ b->Args({1, 128, 128, 3, 3, 2, 2, 1, 2, 2, 1, 16, 16});
509
+ /******************* Final Full Convolution *******************/
510
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
511
+ b->Args({1, 256, 256, 2, 2, 0, 0, 0, 2, 2, 1, 16, 12});
512
+ }
513
+
514
+ static void ESPNet(benchmark::internal::Benchmark* b) {
515
+ b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "A", "SH", "SW", "D", "Cin", "Cout"});
516
+
517
+ /* N H W KH KW PH PW A SH SW D Cin Cout */
518
+ b->Args({1, 64, 128, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
519
+ b->Args({1, 128, 256, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
520
+ b->Args({1, 256, 512, 2, 2, 0, 0, 0, 2, 2, 1, 20, 20});
521
+ }
522
+
523
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn32, "FCN-32")
524
+ ->Apply(FCN32)
525
+ ->UseRealTime();
526
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn16, "FCN-16")
527
+ ->Apply(FCN16)
528
+ ->UseRealTime();
529
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, fcn8, "FCN-8")
530
+ ->Apply(FCN8)
531
+ ->UseRealTime();
532
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, enet, "ENet")
533
+ ->Apply(ENet)
534
+ ->UseRealTime();
535
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_f32, espnet, "ESPNet")
536
+ ->Apply(ESPNet)
537
+ ->UseRealTime();
538
+
539
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn32, "FCN-32")
540
+ ->Apply(FCN32)
541
+ ->UseRealTime();
542
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn16, "FCN-16")
543
+ ->Apply(FCN16)
544
+ ->UseRealTime();
545
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, fcn8, "FCN-8")
546
+ ->Apply(FCN8)
547
+ ->UseRealTime();
548
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, enet, "ENet")
549
+ ->Apply(ENet)
550
+ ->UseRealTime();
551
+ BENCHMARK_CAPTURE(xnnpack_deconvolution_qu8, espnet, "ESPNet")
552
+ ->Apply(ESPNet)
553
+ ->UseRealTime();
554
+
555
+ #ifdef BENCHMARK_TENSORFLOW_LITE
556
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn32, "FCN-32")
557
+ ->Apply(FCN32)
558
+ ->UseRealTime();
559
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn16, "FCN-16")
560
+ ->Apply(FCN16)
561
+ ->UseRealTime();
562
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, fcn8, "FCN-8")
563
+ ->Apply(FCN8)
564
+ ->UseRealTime();
565
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, enet, "ENet")
566
+ ->Apply(ENet)
567
+ ->UseRealTime();
568
+ BENCHMARK_CAPTURE(tflite_deconvolution_f32, espnet, "ESPNet")
569
+ ->Apply(ESPNet)
570
+ ->UseRealTime();
571
+ #endif // BENCHMARK_TENSORFLOW_LITE
572
+
573
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
574
+ BENCHMARK_MAIN();
575
+ #endif
bench/dwconv.h ADDED
@@ -0,0 +1,368 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #pragma once
10
+
11
+ #include <benchmark/benchmark.h>
12
+
13
+
14
+ #define BENCHMARK_DWCONV(dwconv_fn) \
15
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1DWConvArguments)->UseRealTime(); \
16
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2DWConvArguments)->UseRealTime(); \
17
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3SmallDWConvArguments)->UseRealTime(); \
18
+ BENCHMARK_CAPTURE(dwconv_fn, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3LargeDWConvArguments)->UseRealTime(); \
19
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1DWConvArguments)->UseRealTime(); \
20
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2DWConvArguments)->UseRealTime(); \
21
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3DWConvArguments)->UseRealTime(); \
22
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4DWConvArguments)->UseRealTime(); \
23
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8DWConvArguments)->UseRealTime(); \
24
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05DWConvArguments)->UseRealTime(); \
25
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10DWConvArguments)->UseRealTime(); \
26
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15DWConvArguments)->UseRealTime(); \
27
+ BENCHMARK_CAPTURE(dwconv_fn, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20DWConvArguments)->UseRealTime();
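+
+ // Usage sketch (hypothetical function name; any benchmark with the
+ // BENCHMARK_CAPTURE-compatible signature works):
+ //   void f32_dwconv(benchmark::State& state, const char* net) { ... }
+ //   BENCHMARK_DWCONV(f32_dwconv)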
28
+
29
+
30
+ // ShuffleNet v1 with 1 group.
31
+ static void ShuffleNetV1G1DWConvArguments(benchmark::internal::Benchmark* b) {
32
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
33
+
34
+ /********* Stage 2: stride-2 unit *********/
35
+ /* H W KH KW PH PW S D G */
36
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 36});
37
+ /********* Stage 2: stride-1 units ********/
38
+ /* H W KH KW PH PW S D G */
39
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 36});
40
+ /********* Stage 3: stride-2 unit *********/
41
+ /* H W KH KW PH PW S D G */
42
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 72});
43
+ /********* Stage 3: stride-1 units ********/
44
+ /* H W KH KW PH PW S D G */
45
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 72});
46
+ /********* Stage 4: stride-2 unit *********/
47
+ /* H W KH KW PH PW S D G */
48
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 144});
49
+ /********* Stage 4: stride-1 units ********/
50
+ /* H W KH KW PH PW S D G */
51
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 144});
52
+ }
53
+
54
+ // ShuffleNet v1 with 2 groups.
55
+ static void ShuffleNetV1G2DWConvArguments(benchmark::internal::Benchmark* b) {
56
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
57
+
58
+ /********* Stage 2: stride-2 unit *********/
59
+ /* H W KH KW PH PW S D G */
60
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 50});
61
+ /********* Stage 2: stride-1 units ********/
62
+ /* H W KH KW PH PW S D G */
63
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 50});
64
+ /********* Stage 3: stride-2 unit *********/
65
+ /* H W KH KW PH PW S D G */
66
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 100});
67
+ /********* Stage 3: stride-1 units ********/
68
+ /* H W KH KW PH PW S D G */
69
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 100});
70
+ /********* Stage 4: stride-2 unit *********/
71
+ /* H W KH KW PH PW S D G */
72
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 200});
73
+ /********* Stage 4: stride-1 units ********/
74
+ /* H W KH KW PH PW S D G */
75
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 200});
76
+ }
77
+
78
+ // ShuffleNet v1 with 3 groups.
79
+ static void ShuffleNetV1G3DWConvArguments(benchmark::internal::Benchmark* b) {
80
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
81
+
82
+ /********* Stage 2: stride-2 unit **********/
83
+ /* H W KH KW PH PW S D G */
84
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 60});
85
+ /********* Stage 2: stride-1 units *********/
86
+ /* H W KH KW PH PW S D G */
87
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 60});
88
+ /********* Stage 3: stride-2 unit **********/
89
+ /* H W KH KW PH PW S D G */
90
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 120});
91
+ /********* Stage 3: stride-1 units *********/
92
+ /* H W KH KW PH PW S D G */
93
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 120});
94
+ /********* Stage 4: stride-2 unit **********/
95
+ /* H W KH KW PH PW S D G */
96
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 240});
97
+ /********* Stage 4: stride-1 units *********/
98
+ /* H W KH KW PH PW S D G */
99
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 240});
100
+ }
101
+
102
+ // ShuffleNet v1 with 4 groups.
103
+ static void ShuffleNetV1G4DWConvArguments(benchmark::internal::Benchmark* b) {
104
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
105
+
106
+ /********* Stage 2: stride-2 unit *********/
107
+ /* H W KH KW PH PW S D G */
108
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 68});
109
+ /********* Stage 2: stride-1 units ********/
110
+ /* H W KH KW PH PW S D G */
111
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 68});
112
+ /********* Stage 3: stride-2 unit *********/
113
+ /* H W KH KW PH PW S D G */
114
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 136});
115
+ /********* Stage 3: stride-1 units ********/
116
+ /* H W KH KW PH PW S D G */
117
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 136});
118
+ /********* Stage 4: stride-2 unit *********/
119
+ /* H W KH KW PH PW S D G */
120
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 272});
121
+ /********* Stage 4: stride-1 units ********/
122
+ /* H W KH KW PH PW S D G */
123
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 272});
124
+ }
125
+
126
+ // ShuffleNet v1 with 8 groups.
127
+ static void ShuffleNetV1G8DWConvArguments(benchmark::internal::Benchmark* b) {
128
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
129
+
130
+ /********* Stage 2: stride-2 unit *********/
131
+ /* H W KH KW PH PW S D G */
132
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 96});
133
+ /********* Stage 2: stride-1 units ********/
134
+ /* H W KH KW PH PW S D G */
135
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 96});
136
+ /********* Stage 3: stride-2 unit *********/
137
+ /* H W KH KW PH PW S D G */
138
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 192});
139
+ /********* Stage 3: stride-1 units ********/
140
+ /* H W KH KW PH PW S D G */
141
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 192});
142
+ /********* Stage 4: stride-2 unit *********/
143
+ /* H W KH KW PH PW S D G */
144
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 384});
145
+ /********* Stage 4: stride-1 units ********/
146
+ /* H W KH KW PH PW S D G */
147
+ b->Args({ 7, 7, 3, 3, 2, 2, 2, 1, 384});
148
+ }
149
+
150
+ // ShuffleNet v2 (0.5X scale)
151
+ static void ShuffleNetV2X05DWConvArguments(benchmark::internal::Benchmark* b) {
152
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
153
+
154
+ /**************** Stage 2 *****************/
155
+ /* H W KH KW PH PW S D G */
156
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
157
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 24});
158
+ /**************** Stage 3 *****************/
159
+ /* H W KH KW PH PW S D G */
160
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 48});
161
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 48});
162
+ /**************** Stage 4 *****************/
163
+ /* H W KH KW PH PW S D G */
164
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 96});
165
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 96});
166
+ }
167
+
168
+ // ShuffleNet v2 (1.0X scale)
169
+ static void ShuffleNetV2X10DWConvArguments(benchmark::internal::Benchmark* b) {
170
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
171
+
172
+ /**************** Stage 2 *****************/
173
+ /* H W KH KW PH PW S D G */
174
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
175
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 58});
176
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 58});
177
+ /**************** Stage 3 *****************/
178
+ /* H W KH KW PH PW S D G */
179
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 116});
180
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 116});
181
+ /**************** Stage 4 *****************/
182
+ /* H W KH KW PH PW S D G */
183
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 232});
184
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 232});
185
+ }
186
+
187
+ // ShuffleNet v2 (1.5X scale)
188
+ static void ShuffleNetV2X15DWConvArguments(benchmark::internal::Benchmark* b) {
189
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
190
+
191
+ /**************** Stage 2 *****************/
192
+ /* H W KH KW PH PW S D G */
193
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
194
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 88});
195
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 88});
196
+ /**************** Stage 3 *****************/
197
+ /* H W KH KW PH PW S D G */
198
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 176});
199
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 176});
200
+ /**************** Stage 4 *****************/
201
+ /* H W KH KW PH PW S D G */
202
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 352});
203
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 352});
204
+ }
205
+
206
+ // ShuffleNet v2 (2.0X scale)
207
+ static void ShuffleNetV2X20DWConvArguments(benchmark::internal::Benchmark* b) {
208
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
209
+
210
+ /***************** Stage 2 ****************/
211
+ /* H W KH KW PH PW S D G */
212
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 24});
213
+ b->Args({56, 56, 3, 3, 2, 2, 2, 1, 122});
214
+ b->Args({28, 28, 3, 3, 2, 2, 1, 1, 122});
215
+ /***************** Stage 3 ****************/
216
+ /* H W KH KW PH PW S D G */
217
+ b->Args({28, 28, 3, 3, 2, 2, 2, 1, 244});
218
+ b->Args({14, 14, 3, 3, 2, 2, 1, 1, 244});
219
+ /***************** Stage 4 ****************/
220
+ /* H W KH KW PH PW S D G */
221
+ b->Args({14, 14, 3, 3, 2, 2, 2, 1, 488});
222
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 488});
223
+ }
224
+
225
+ static void MobileNetV1DWConvArguments(benchmark::internal::Benchmark* b) {
226
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
227
+
228
+ /* H W KH KW PH PW S D G */
229
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 32});
230
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 64});
231
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 128});
232
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 128});
233
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 256});
234
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 256});
235
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 512});
236
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 512});
237
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 1024});
238
+ }
239
+
240
+ static void MobileNetV2DWConvArguments(benchmark::internal::Benchmark* b) {
241
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
242
+
243
+ /**************** Bottleneck 1 ***************/
244
+ /* H W KH KW PH PW S D G */
245
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 32});
246
+
247
+ /**************** Bottleneck 2 ***************/
248
+ /* H W KH KW PH PW S D G */
249
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 96});
250
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 144});
251
+
252
+ /**************** Bottleneck 3 ***************/
253
+ /* H W KH KW PH PW S D G */
254
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 144});
255
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 192});
256
+ //b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 192});
257
+
258
+ /**************** Bottleneck 4 ***************/
259
+ /* H W KH KW PH PW S D G */
260
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 192});
261
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
262
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
263
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
264
+
265
+ /**************** Bottleneck 5 ***************/
266
+ /* H W KH KW PH PW S D G */
267
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 384});
268
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 576});
269
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 576});
270
+
271
+ /**************** Bottleneck 6 ***************/
272
+ /* H W KH KW PH PW S D G */
273
+ b->Args({ 14, 14, 3, 3, 2, 2, 2, 1, 576});
274
+ b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
275
+ //b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
276
+
277
+ /**************** Bottleneck 7 ***************/
278
+ /* H W KH KW PH PW S D G */
279
+ //b->Args({ 7, 7, 3, 3, 2, 2, 1, 1, 960});
280
+ }
281
+
282
+ static void MobileNetV3SmallDWConvArguments(benchmark::internal::Benchmark* b) {
283
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
284
+
285
+ /*************** Bottleneck 1 ***************/
286
+ /* H W KH KW PH PW S D G */
287
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 16});
288
+ /*************** Bottleneck 2 ***************/
289
+ /* H W KH KW PH PW S D G */
290
+ b->Args({ 56, 56, 3, 3, 2, 2, 2, 1, 72});
291
+ /*************** Bottleneck 3 ***************/
292
+ /* H W KH KW PH PW S D G */
293
+ b->Args({ 28, 28, 3, 3, 2, 2, 1, 1, 88});
294
+ /*************** Bottleneck 4 ***************/
295
+ /* H W KH KW PH PW S D G */
296
+ b->Args({ 28, 28, 5, 5, 4, 4, 2, 1, 96});
297
+ /*************** Bottleneck 5 ***************/
298
+ /* H W KH KW PH PW S D G */
299
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 240});
300
+ /*************** Bottleneck 6 ***************/
301
+ /* H W KH KW PH PW S D G */
302
+ //b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 240});
303
+ /*************** Bottleneck 7 ***************/
304
+ /* H W KH KW PH PW S D G */
305
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 120});
306
+ /*************** Bottleneck 8 ***************/
307
+ /* H W KH KW PH PW S D G */
308
+ b->Args({ 14, 14, 5, 5, 4, 4, 1, 1, 144});
309
+ /*************** Bottleneck 9 ***************/
310
+ /* H W KH KW PH PW S D G */
311
+ b->Args({ 14, 14, 5, 5, 4, 4, 2, 1, 288});
312
+ /*************** Bottleneck 10 **************/
313
+ /* H W KH KW PH PW S D G */
314
+ b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 576});
315
+ /*************** Bottleneck 11 **************/
316
+ /* H W KH KW PH PW S D G */
317
+ //b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 576});
318
+ }
319
+
320
+ static void MobileNetV3LargeDWConvArguments(benchmark::internal::Benchmark* b) {
321
+ b->ArgNames({"H", "W", "KH", "KW", "PH", "PW", "S", "D", "G"});
322
+
323
+ /*************** Bottleneck 1 ***************/
324
+ /* H W KH KW PH PW S D G */
325
+ b->Args({112, 112, 3, 3, 2, 2, 1, 1, 16});
326
+ /*************** Bottleneck 2 ***************/
327
+ /* H W KH KW PH PW S D G */
328
+ b->Args({112, 112, 3, 3, 2, 2, 2, 1, 64});
329
+ /*************** Bottleneck 3 ***************/
330
+ /* H W KH KW PH PW S D G */
331
+ b->Args({ 56, 56, 3, 3, 2, 2, 1, 1, 72});
332
+ /*************** Bottleneck 4 ***************/
333
+ /* H W KH KW PH PW S D G */
334
+ b->Args({ 56, 56, 5, 5, 4, 4, 2, 1, 72});
335
+ /*************** Bottleneck 5 ***************/
336
+ /* H W KH KW PH PW S D G */
337
+ b->Args({ 28, 28, 5, 5, 4, 4, 1, 1, 120});
338
+ /*************** Bottleneck 6 ***************/
339
+ /* H W KH KW PH PW S D G */
340
+ //b->Args({ 28, 28, 5, 5, 4, 4, 1, 1, 120});
341
+ /*************** Bottleneck 7 ***************/
342
+ /* H W KH KW PH PW S D G */
343
+ b->Args({ 28, 28, 3, 3, 2, 2, 2, 1, 240});
344
+ /*************** Bottleneck 8 ***************/
345
+ /* H W KH KW PH PW S D G */
346
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 200});
347
+ /*************** Bottleneck 9 ***************/
348
+ /* H W KH KW PH PW S D G */
349
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 184});
350
+ /*************** Bottleneck 10 **************/
351
+ /* H W KH KW PH PW S D G */
352
+ //b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 184});
353
+ /*************** Bottleneck 11 **************/
354
+ /* H W KH KW PH PW S D G */
355
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 480});
356
+ /*************** Bottleneck 12 **************/
357
+ /* H W KH KW PH PW S D G */
358
+ b->Args({ 14, 14, 3, 3, 2, 2, 1, 1, 672});
359
+ /*************** Bottleneck 13 **************/
360
+ /* H W KH KW PH PW S D G */
361
+ b->Args({ 14, 14, 5, 5, 4, 4, 2, 1, 672});
362
+ /*************** Bottleneck 14 **************/
363
+ /* H W KH KW PH PW S D G */
364
+ b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 960});
365
+ /*************** Bottleneck 15 **************/
366
+ /* H W KH KW PH PW S D G */
367
+ //b->Args({ 7, 7, 5, 5, 4, 4, 1, 1, 960});
368
+ }
bench/elu.cc ADDED
@@ -0,0 +1,460 @@
1
+ // Copyright 2020 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <array>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <limits>
11
+ #include <memory>
12
+ #include <random>
13
+ #include <vector>
14
+
15
+ #include <xnnpack.h>
16
+
17
+ #include <fp16/fp16.h>
18
+ #include "bench/utils.h"
19
+ #include <benchmark/benchmark.h>
20
+ #ifdef BENCHMARK_TENSORFLOW_LITE
21
+ #include "flatbuffers/include/flatbuffers/flatbuffers.h"
22
+ #include "tensorflow/lite/interpreter.h"
23
+ #include "tensorflow/lite/kernels/register.h"
24
+ #include "tensorflow/lite/model.h"
25
+ #include "tensorflow/lite/schema/schema_generated.h"
26
+ #include "tensorflow/lite/version.h"
27
+ #endif // BENCHMARK_TENSORFLOW_LITE
28
+
29
+
30
+ static void xnnpack_elu_f16(benchmark::State& state) {
31
+ const size_t batch_size = state.range(0);
32
+
33
+ std::random_device random_device;
34
+ auto rng = std::mt19937(random_device());
35
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
36
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
37
+
38
+ std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
39
+ std::vector<uint16_t> output(batch_size);
40
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
41
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
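+ // Pre-filling the output with a NaN pattern makes it easy to spot, when
+ // debugging, any elements the operator failed to overwrite; 0x7E00 is a
+ // canonical half-precision NaN.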
42
+
43
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
44
+ if (status != xnn_status_success) {
45
+ state.SkipWithError("failed to initialize XNNPACK");
46
+ return;
47
+ }
48
+
49
+ xnn_operator_t elu_op = nullptr;
50
+ status = xnn_create_elu_nc_f16(
51
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
52
+ 1.0f /* alpha */, 0 /* flags */, &elu_op);
53
+ if (status != xnn_status_success || elu_op == nullptr) {
54
+ state.SkipWithError("failed to create ELU operator");
55
+ return;
56
+ }
57
+
58
+ status = xnn_reshape_elu_nc_f16(elu_op, batch_size, /*threadpool=*/nullptr);
59
+ if (status != xnn_status_success) {
60
+ state.SkipWithError("failed to reshape ELU operator");
61
+ return;
62
+ }
63
+
64
+ status = xnn_setup_elu_nc_f16(elu_op, input.data(), output.data());
65
+ if (status != xnn_status_success) {
66
+ state.SkipWithError("failed to setup ELU operator");
67
+ return;
68
+ }
69
+
70
+ for (auto _ : state) {
71
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
72
+ if (status != xnn_status_success) {
73
+ state.SkipWithError("failed to run ELU operator");
74
+ return;
75
+ }
76
+ }
77
+
78
+ status = xnn_delete_operator(elu_op);
79
+ if (status != xnn_status_success) {
80
+ state.SkipWithError("failed to delete ELU operator");
81
+ return;
82
+ }
83
+
84
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
85
+ if (cpu_frequency != 0) {
86
+ state.counters["cpufreq"] = cpu_frequency;
87
+ }
88
+
89
+ state.counters["elements"] =
90
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
91
+
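+ // The factor of 2 in bytes_per_iteration counts each element once as a read
+ // (input) and once as a write (output), so "bytes" reports total memory
+ // traffic per second.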
92
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(uint16_t);
93
+ state.counters["bytes"] =
94
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
95
+ }
96
+
97
+ static void xnnpack_elu_f32(benchmark::State& state) {
98
+ const size_t batch_size = state.range(0);
99
+
100
+ std::random_device random_device;
101
+ auto rng = std::mt19937(random_device());
102
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
103
+
104
+ std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
105
+ std::vector<float> output(batch_size);
106
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
107
+ std::fill(output.begin(), output.end(), std::nanf(""));
108
+
109
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
110
+ if (status != xnn_status_success) {
111
+ state.SkipWithError("failed to initialize XNNPACK");
112
+ return;
113
+ }
114
+
115
+ xnn_operator_t elu_op = nullptr;
116
+ status = xnn_create_elu_nc_f32(
117
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
118
+ 1.0f /* alpha */, 0 /* flags */, &elu_op);
119
+ if (status != xnn_status_success || elu_op == nullptr) {
120
+ state.SkipWithError("failed to create ELU operator");
121
+ return;
122
+ }
123
+
124
+ status = xnn_reshape_elu_nc_f32(elu_op, batch_size, /*threadpool=*/nullptr);
125
+ if (status != xnn_status_success) {
126
+ state.SkipWithError("failed to reshape ELU operator");
127
+ return;
128
+ }
129
+
130
+ status = xnn_setup_elu_nc_f32(elu_op, input.data(), output.data());
131
+ if (status != xnn_status_success) {
132
+ state.SkipWithError("failed to setup ELU operator");
133
+ return;
134
+ }
135
+
136
+ for (auto _ : state) {
137
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
138
+ if (status != xnn_status_success) {
139
+ state.SkipWithError("failed to run ELU operator");
140
+ return;
141
+ }
142
+ }
143
+
144
+ status = xnn_delete_operator(elu_op);
145
+ if (status != xnn_status_success) {
146
+ state.SkipWithError("failed to delete ELU operator");
147
+ return;
148
+ }
149
+
150
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
151
+ if (cpu_frequency != 0) {
152
+ state.counters["cpufreq"] = cpu_frequency;
153
+ }
154
+
155
+ state.counters["elements"] =
156
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
157
+
158
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
159
+ state.counters["bytes"] =
160
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
161
+ }
162
+
163
+ static void xnnpack_elu_qs8(benchmark::State& state) {
164
+ const size_t batch_size = state.range(0);
165
+
166
+ std::random_device random_device;
167
+ auto rng = std::mt19937(random_device());
168
+ auto i8rng = std::bind(
169
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
170
+ std::ref(rng));
171
+
172
+ std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
173
+ std::vector<int8_t> output(batch_size);
174
+ std::generate(input.begin(), input.end(), std::ref(i8rng));
175
+ std::fill(output.begin(), output.end(), INT8_C(0xA5));
176
+
177
+ xnn_status status = xnn_initialize(nullptr /* allocator */);
178
+ if (status != xnn_status_success) {
179
+ state.SkipWithError("failed to initialize XNNPACK");
180
+ return;
181
+ }
182
+
183
+ xnn_operator_t elu_op = nullptr;
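+ // The quantization in the call below is the identity mapping (zero point 0,
+ // scale 1.0) on both input and output; any valid parameters would do for
+ // measuring throughput.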
184
+ status = xnn_create_elu_nc_qs8(
185
+ 1 /* channels */, 1 /* input stride */, 1 /* output stride */,
186
+ 1.0f /* alpha */,
187
+ 0 /* input zero point */, 1.0f /* input scale */,
188
+ 0 /* output zero point */, 1.0f /* output scale */,
189
+ std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
190
+ 0 /* flags */, &elu_op);
191
+ if (status != xnn_status_success || elu_op == nullptr) {
192
+ state.SkipWithError("failed to create ELU operator");
193
+ return;
194
+ }
195
+
196
+ status = xnn_reshape_elu_nc_qs8(elu_op, batch_size, /*threadpool=*/nullptr);
197
+ if (status != xnn_status_success) {
198
+ state.SkipWithError("failed to reshape ELU operator");
199
+ return;
200
+ }
201
+
202
+ status = xnn_setup_elu_nc_qs8(elu_op, input.data(), output.data());
203
+ if (status != xnn_status_success) {
204
+ state.SkipWithError("failed to setup ELU operator");
205
+ return;
206
+ }
207
+
208
+ for (auto _ : state) {
209
+ status = xnn_run_operator(elu_op, nullptr /* thread pool */);
210
+ if (status != xnn_status_success) {
211
+ state.SkipWithError("failed to run ELU operator");
212
+ return;
213
+ }
214
+ }
215
+
216
+ status = xnn_delete_operator(elu_op);
217
+ if (status != xnn_status_success) {
218
+ state.SkipWithError("failed to delete ELU operator");
219
+ return;
220
+ }
221
+
222
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
223
+ if (cpu_frequency != 0) {
224
+ state.counters["cpufreq"] = cpu_frequency;
225
+ }
226
+
227
+ state.counters["elements"] =
228
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
229
+
230
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
231
+ state.counters["bytes"] =
232
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
233
+ }
234
+
235
+ #ifdef BENCHMARK_TENSORFLOW_LITE
236
+ static void tflite_elu_f32(benchmark::State& state) {
237
+ const size_t batch_size = state.range(0);
238
+
239
+ std::random_device random_device;
240
+ auto rng = std::mt19937(random_device());
241
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 20.0f), std::ref(rng));
242
+
243
+ flatbuffers::FlatBufferBuilder builder;
244
+ const flatbuffers::Offset<tflite::OperatorCode> operator_code =
245
+ CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);
246
+
247
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
248
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
249
+ }};
250
+
251
+ const std::array<int32_t, 1> shape{{
252
+ static_cast<int32_t>(batch_size)
253
+ }};
254
+
255
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
256
+ tflite::CreateTensor(builder,
257
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
258
+ tflite::TensorType_FLOAT32),
259
+ tflite::CreateTensor(builder,
260
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
261
+ tflite::TensorType_FLOAT32),
262
+ }};
263
+
264
+ const std::array<int32_t, 1> op_inputs{{ 0 }};
265
+ const std::array<int32_t, 1> op_outputs{{ 1 }};
266
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
267
+ builder,
268
+ 0 /* opcode_index */,
269
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
270
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
271
+
272
+ const std::array<int32_t, 1> graph_inputs{{ 0 }};
273
+ const std::array<int32_t, 1> graph_outputs{{ 1 }};
274
+ const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
275
+ builder,
276
+ builder.CreateVector(tensors.data(), tensors.size()),
277
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
278
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
279
+ builder.CreateVector(&op, 1));
280
+
281
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
282
+ TFLITE_SCHEMA_VERSION,
283
+ builder.CreateVector(&operator_code, 1),
284
+ builder.CreateVector(&subgraph, 1),
285
+ builder.CreateString("ELU model"),
286
+ builder.CreateVector(buffers.data(), buffers.size()));
287
+
288
+ builder.Finish(model_buffer);
289
+
290
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
291
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
292
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
293
+ std::unique_ptr<tflite::Interpreter> interpreter;
294
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
295
+ state.SkipWithError("failed to create TFLite interpreter");
296
+ return;
297
+ }
298
+ interpreter->SetNumThreads(1);
299
+
300
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
301
+ state.SkipWithError("failed to allocate tensors");
302
+ return;
303
+ }
304
+
305
+ std::generate(
306
+ interpreter->typed_tensor<float>(0),
307
+ interpreter->typed_tensor<float>(0) + batch_size,
308
+ std::ref(f32rng));
309
+
310
+ for (auto _ : state) {
311
+ if (interpreter->Invoke() != kTfLiteOk) {
312
+ state.SkipWithError("failed to invoke TFLite interpreter");
313
+ return;
314
+ }
315
+ }
316
+
317
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
318
+ if (cpu_frequency != 0) {
319
+ state.counters["cpufreq"] = cpu_frequency;
320
+ }
321
+
322
+ state.counters["elements"] =
323
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
324
+
325
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(float);
326
+ state.counters["bytes"] =
327
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
328
+
329
+ interpreter.reset();
330
+ }
331
+
332
+ static void tflite_elu_qs8(benchmark::State& state) {
333
+ const size_t batch_size = state.range(0);
334
+
335
+ std::random_device random_device;
336
+ auto rng = std::mt19937(random_device());
337
+ auto i8rng = std::bind(
338
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
339
+ std::ref(rng));
340
+
341
+ flatbuffers::FlatBufferBuilder builder;
342
+ const flatbuffers::Offset<tflite::OperatorCode> operator_code =
343
+ CreateOperatorCode(builder, tflite::BuiltinOperator_ELU);
344
+
345
+ const std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
346
+ tflite::CreateBuffer(builder, builder.CreateVector({})),
347
+ }};
348
+
349
+ const std::array<int32_t, 1> shape{{
350
+ static_cast<int32_t>(batch_size)
351
+ }};
352
+
353
+ const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
354
+ tflite::CreateTensor(builder,
355
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
356
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
357
+ tflite::CreateQuantizationParameters(builder,
358
+ 0 /*min*/, 0 /*max*/,
359
+ builder.CreateVector<float>({1.0f /* scale */}),
360
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
361
+ tflite::CreateTensor(builder,
362
+ builder.CreateVector<int32_t>(shape.data(), shape.size()),
363
+ tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
364
+ tflite::CreateQuantizationParameters(builder,
365
+ 0 /*min*/, 0 /*max*/,
366
+ builder.CreateVector<float>({1.0f /* scale */}),
367
+ builder.CreateVector<int64_t>({1 /* zero point */}))),
368
+ }};
369
+
370
+ const std::array<int32_t, 1> op_inputs{{ 0 }};
371
+ const std::array<int32_t, 1> op_outputs{{ 1 }};
372
+ flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(
373
+ builder,
374
+ 0 /* opcode_index */,
375
+ builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
376
+ builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));
377
+
378
+ const std::array<int32_t, 1> graph_inputs{{ 0 }};
379
+ const std::array<int32_t, 1> graph_outputs{{ 1 }};
380
+ const flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
381
+ builder,
382
+ builder.CreateVector(tensors.data(), tensors.size()),
383
+ builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
384
+ builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
385
+ builder.CreateVector(&op, 1));
386
+
387
+ const flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
388
+ TFLITE_SCHEMA_VERSION,
389
+ builder.CreateVector(&operator_code, 1),
390
+ builder.CreateVector(&subgraph, 1),
391
+ builder.CreateString("ELU model"),
392
+ builder.CreateVector(buffers.data(), buffers.size()));
393
+
394
+ builder.Finish(model_buffer);
395
+
396
+ const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
397
+ tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
398
+ tflite::InterpreterBuilder interpreterBuilder(model, resolver);
399
+ std::unique_ptr<tflite::Interpreter> interpreter;
400
+ if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
401
+ state.SkipWithError("failed to create TFLite interpreter");
402
+ return;
403
+ }
404
+ interpreter->SetNumThreads(1);
405
+
406
+ if (interpreter->AllocateTensors() != kTfLiteOk) {
407
+ state.SkipWithError("failed to allocate tensors");
408
+ return;
409
+ }
410
+
411
+ std::generate(
412
+ interpreter->typed_tensor<int8_t>(0),
413
+ interpreter->typed_tensor<int8_t>(0) + batch_size,
414
+ std::ref(i8rng));
415
+
416
+ for (auto _ : state) {
417
+ if (interpreter->Invoke() != kTfLiteOk) {
418
+ state.SkipWithError("failed to invoke TFLite interpreter");
419
+ return;
420
+ }
421
+ }
422
+
423
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
424
+ if (cpu_frequency != 0) {
425
+ state.counters["cpufreq"] = cpu_frequency;
426
+ }
427
+
428
+ state.counters["elements"] =
429
+ benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);
430
+
431
+ const size_t bytes_per_iteration = 2 * batch_size * sizeof(int8_t);
432
+ state.counters["bytes"] =
433
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
434
+
435
+ interpreter.reset();
436
+ }
437
+ #endif // BENCHMARK_TENSORFLOW_LITE
438
+
439
+ BENCHMARK(xnnpack_elu_f16)
440
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
441
+ ->UseRealTime();
442
+ BENCHMARK(xnnpack_elu_f32)
443
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
444
+ ->UseRealTime();
445
+ BENCHMARK(xnnpack_elu_qs8)
446
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
447
+ ->UseRealTime();
448
+
449
+ #ifdef BENCHMARK_TENSORFLOW_LITE
450
+ BENCHMARK(tflite_elu_f32)
451
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
452
+ ->UseRealTime();
453
+ BENCHMARK(tflite_elu_qs8)
454
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
455
+ ->UseRealTime();
456
+ #endif // BENCHMARK_TENSORFLOW_LITE
457
+
458
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
459
+ BENCHMARK_MAIN();
460
+ #endif
bench/end2end.cc ADDED
@@ -0,0 +1,201 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/models.h>
+
+
+ static void End2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory)
+ {
+   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   const size_t num_threads = state.range(0);
+   std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> threadpool(
+     pthreadpool_create(num_threads), pthreadpool_destroy);
+
+   auto execution_plan = model_factory(threadpool.get());
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), threadpool.get());
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+ }
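+
+ // The wrappers below bind End2EndBenchmark to the model factories declared in <xnnpack/models.h>.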
+ static void FP32MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV1);
+ }
+
+ static void FP32MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV2);
+ }
+
+ static void FP32MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3Large);
+ }
+
+ static void FP32MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3Small);
+ }
+
+ #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+ static void FP32MobileNetV3SmallFused(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP32MobileNetV3SmallFused);
+ }
+ #endif  // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+
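+ // The Sparse80 variants run the same networks with 80% sparse weights (the 0.8f argument below).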
+ static void FP32Sparse80MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV1(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV2(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV3Large(0.8f, threadpool);
+   });
+ }
+
+ static void FP32Sparse80MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP32SparseMobileNetV3Small(0.8f, threadpool);
+   });
+ }
+
+ static void FP16MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV1);
+ }
+
+ static void FP16MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV2);
+ }
+
+ static void FP16MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV3Large);
+ }
+
+ static void FP16MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, models::FP16MobileNetV3Small);
+ }
+
+ static void FP16Sparse80MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV1(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV2(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV3Large(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV3Large(0.8f, threadpool);
+   });
+ }
+
+ static void FP16Sparse80MobileNetV3Small(benchmark::State& state) {
+   End2EndBenchmark(state, [](pthreadpool_t threadpool) {
+     return models::FP16SparseMobileNetV3Small(0.8f, threadpool);
+   });
+ }
+
+ static void QC8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QC8MobileNetV1);
+ }
+
+ static void QC8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QC8MobileNetV2);
+ }
+
+ static void QS8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QS8MobileNetV1);
+ }
+
+ static void QS8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QS8MobileNetV2);
+ }
+
+ static void QU8MobileNetV1(benchmark::State& state) {
+   End2EndBenchmark(state, models::QU8MobileNetV1);
+ }
+
+ static void QU8MobileNetV2(benchmark::State& state) {
+   End2EndBenchmark(state, models::QU8MobileNetV2);
+ }
+
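+ // state.range(0) carries the thread count; MultiThreadingParameters (bench/utils.h) supplies the values swept.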
+ BENCHMARK(FP32MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP32Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP32Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP16MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(FP16Sparse80MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV3Large)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(FP16Sparse80MobileNetV3Small)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QC8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QC8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QS8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QS8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ BENCHMARK(QU8MobileNetV1)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ BENCHMARK(QU8MobileNetV2)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+ BENCHMARK(FP32MobileNetV3SmallFused)->Apply(benchmark::utils::MultiThreadingParameters)->Unit(benchmark::kMicrosecond)->UseRealTime();
+ #endif  // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/end2end.h ADDED
@@ -0,0 +1,37 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #pragma once
+
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack/models.h>
+
+
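+ // Each macro expands to one BENCHMARK_CAPTURE per MobileNet variant, binding the
+ // corresponding model factory to the given benchmark function.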
+ #define BENCHMARK_FP16_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP16MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP16MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP16MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP16MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_FP32_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3Large)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3Small)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_FP32_END2END_JIT(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::FP32MobileNetV1Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::FP32MobileNetV2Jit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_large, models::FP32MobileNetV3LargeJit)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v3_small, models::FP32MobileNetV3SmallJit)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_QS8_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QS8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QS8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+ #define BENCHMARK_QU8_END2END(benchmark_fn) \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v1, models::QU8MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime(); \
+   BENCHMARK_CAPTURE(benchmark_fn, mobilenet_v2, models::QU8MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
bench/f16-conv-hwc2chw.cc ADDED
@@ -0,0 +1,130 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/dconv.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/conv.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/pack.h>
+
+
+ static void f16_conv_hwc2chw(benchmark::State& state,
+   xnn_f16_conv_hwc2chw_ukernel_fn conv,
+   uint32_t output_channels_tile,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if ((isa_check != nullptr) && !isa_check(state)) {
+     return;
+   }
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t output_channels = state.range(2);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t input_channels = 3;
+   const size_t kernel_size = 3;
+   const size_t padding = 1;
+   const size_t subsampling = 2;
+
+   const size_t output_height = (input_height + 2 * padding - kernel_size) / subsampling + 1;
+   const size_t output_width = (input_width + 2 * padding - kernel_size) / subsampling + 1;
+
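+   // Fixed geometry: 3x3 kernel, stride 2, unit padding, 3 input channels; HWC input is
+   // converted to CHW output, as in the first layer of typical vision models.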
+   std::vector<uint16_t> input(input_height * input_width * input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+   std::vector<uint16_t> kernel(output_channels * kernel_size * kernel_size * input_channels);
+   std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
+   std::vector<uint16_t> bias(output_channels);
+   std::generate(bias.begin(), bias.end(), std::ref(f16rng));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> zero(input_channels * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t weights_elements = (kernel_size * kernel_size * input_channels + 1) *
+     benchmark::utils::RoundUp<size_t>(output_channels, output_channels_tile);
+   const size_t output_elements = output_height * output_width * output_channels;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (weights_elements + output_elements));
+
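+   // Keep enough weight/output buffer copies to overflow the last-level cache, so every
+   // iteration reads cold packed weights instead of cached ones.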
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(weights_elements * num_buffers);
+   std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
+   xnn_pack_f16_dconv_oki_w(
+     output_channels, input_channels, output_channels_tile,
+     kernel_size /* kernel height */, kernel_size /* kernel width */,
+     kernel.data(), bias.data(), packed_weights.data(), nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(packed_weights.cbegin(),
+               packed_weights.cbegin() + weights_elements,
+               packed_weights.begin() + n * weights_elements);
+   }
+
+   std::vector<uint16_t> output(output_elements * num_buffers);
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+   xnn_f16_minmax_params params;
+   init_params(&params, UINT16_C(0xFC00) /* min = -inf */, UINT16_C(0x7C00) /* max = +inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     conv(
+       input_height, input_width,
+       0 /* output_y_start */, output_height /* output_y_end */,
+       input.data(), zero.data(),
+       packed_weights.data() + buffer_index * weights_elements,
+       output.data() + buffer_index * output_elements,
+       padding, output_channels,
+       output_channels * output_width * sizeof(uint16_t),
+       output_channels * sizeof(uint16_t),
+       &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
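+   // 2 FLOPs (multiply and add) per MAC; MAC count = output pixels x kernel area x
+   // input channels x output channels.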
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       input_channels * output_channels *
+       kernel_size * kernel_size,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+   static void f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2(benchmark::State& state, const char* net) {
+     f16_conv_hwc2chw(state, xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2, 4,
+       xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   BENCHMARK_DCONV(f16_conv_hwc2chw_3x3s2p1c3x4__neonfp16arith_2x2);
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-dwconv-e2e.cc ADDED
@@ -0,0 +1,736 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <cstring>
+ #include <functional>
+ #include <memory>
+ #include <random>
+ #include <vector>
+
+ #include "bench/end2end.h"
+ #include "bench/utils.h"
+ #include <benchmark/benchmark.h>
+
+ #include <xnnpack.h>
+ #include <xnnpack/config.h>
+ #include <xnnpack/dwconv.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/models.h>
+
+
+ static void DWConvEnd2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory,
+   xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv_minmax,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint8_t channel_tile, uint8_t primary_tile,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
+     state.SkipWithError("failed to initialize XNNPACK");
+     return;
+   }
+
+   struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
+   if (dwconv_config == nullptr) {
+     state.SkipWithError("hardware does not support F16 DWCONV");
+     return;
+   }
+
+   // Save dwconv_config so that we can modify it for the benchmark and later restore it.
+   struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
+   memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
+
+   // Override microkernels chosen in xnn_initialize
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     // Replace only the microkernel with the matching kernel size.
+     if (dwconv_config[i].primary_tile == primary_tile) {
+       std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
+
+       // Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
+       dwconv_config[i].minmax.unipass = xnn_dwconv_unipass_ukernel_fn(dwconv_minmax);
+       dwconv_config[i].channel_tile = channel_tile;
+       dwconv_config[i].channel_subtile = channel_tile;
+       dwconv_config[i].channel_round = 1;
+       dwconv_config[i].primary_tile = primary_tile;
+       dwconv_config[i].init.f16 = init_params;
+       break;
+     }
+   }
+
+   auto execution_plan = model_factory(nullptr);
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), nullptr);
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   // Restore dwconv_config to original state as defined in init.c.
+   memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
+ }
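+
+ // Multipass overload: besides the first-pass tile it overrides middle/last tiles, and it
+ // replaces the config entry whose primary tile equals primary_tile_to_replace (or the first
+ // multipass entry found).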
+ static void DWConvEnd2EndBenchmark(
+   benchmark::State& state,
+   models::ExecutionPlanFactory model_factory,
+   xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv_minmax,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint8_t channel_tile, uint8_t channel_subtile, uint8_t channel_round,
+   uint8_t primary_tile, uint8_t middle_tile, uint8_t last_tile,
+   uint8_t primary_tile_to_replace,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   struct xnn_dwconv_config* dwconv_config = xnn_init_f16_dwconv_config();
+   if (dwconv_config == nullptr) {
+     state.SkipWithError("failed to initialize f16 DWCONV config");
+     return;
+   }
+
+   // Save dwconv_config so that we can modify it for the benchmark and later restore it.
+   struct xnn_dwconv_config saved_dwconv_params[XNN_MAX_F16_DWCONV_UKERNELS];
+   memcpy(saved_dwconv_params, dwconv_config, sizeof(saved_dwconv_params));
+
+   bool found = false;
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     if (dwconv_config[i].primary_tile == primary_tile_to_replace) {
+       found = true;
+     } else if (dwconv_config[i].last_tile != 0) {
+       // Found a multipass microkernel, replace it.
+       found = true;
+     }
+   }
+
+   if (!found) {
+     state.SkipWithError("can't replace with multipass");
+     return;
+   }
+
+   // Override microkernels chosen in xnn_initialize
+   for (size_t i = 0; i < XNN_MAX_F16_DWCONV_UKERNELS; i++) {
+     // Replace only the microkernel with the matching kernel size.
+     if (dwconv_config[i].primary_tile == primary_tile_to_replace ||
+         dwconv_config[i].last_tile != 0) {
+       // Replace either when the primary_tile_to_replace matches, or replace the
+       // first multipass dwconv microkernel we find.
+       // TODO(zhin): support specifying target multipass dwconv to replace.
+       std::memset(&dwconv_config[i], 0, sizeof(dwconv_config[i]));
+
+       // Note: do not directly assign to dwconv_config[i] because it breaks older gcc.
+       dwconv_config[i].minmax.multipass = xnn_dwconv_multipass_ukernel_fn(dwconv_minmax);
+       dwconv_config[i].channel_tile = channel_tile;
+       dwconv_config[i].channel_subtile = channel_subtile;
+       dwconv_config[i].channel_round = channel_round;
+       dwconv_config[i].primary_tile = primary_tile;
+       dwconv_config[i].middle_tile = middle_tile;
+       dwconv_config[i].last_tile = last_tile;
+       dwconv_config[i].init.f16 = init_params;
+       break;
+     }
+   }
+
+   auto execution_plan = model_factory(nullptr);
+   if (execution_plan.empty()) {
+     state.SkipWithError("failed to create a model");
+     return;
+   }
+
+   for (auto _ : state) {
+     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
+       xnn_status status = xnn_run_operator(op.get(), nullptr);
+       if (status != xnn_status_success) {
+         state.SkipWithError("failed to run a model");
+         return;
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   memcpy(dwconv_config, saved_dwconv_params, sizeof(saved_dwconv_params));
+ }
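+
+ // Microkernel names encode the tiling, matching the commented arguments below:
+ // "9p16c" = 9-element primary tile over 16 channels; "5f5m5l16c8s4r" = 5-tap
+ // first/middle/last passes, 16-channel tile, 8-channel subtile, channels rounded to 4.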
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+   static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/4, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/9, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
+   }
+
+   BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_4p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_9p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__neonfp16arith_acc2);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith);
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__neonfp16arith_acc2);
+
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
+
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+   static void f16_dwconv_25p8c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p16c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p32c__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/5, /*middle_tile=*/5, /*last_tile=*/5,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/6, /*middle_tile=*/6, /*last_tile=*/7,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+   static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+     DWConvEnd2EndBenchmark(
+       state, model,
+       xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
+       /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
+       /*primary_tile=*/8, /*middle_tile=*/8, /*last_tile=*/9,
+       /*primary_tile_to_replace=*/25,
+       /*isa_check=*/benchmark::utils::CheckFMA3);
+   }
+
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p8c__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p16c__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_25p32c__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)
+
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3)
+   BENCHMARK_FP16_END2END(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)
+
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-dwconv.cc ADDED
@@ -0,0 +1,795 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/dwconv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/dwconv.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microkernel-utils.h>
25
+ #include <xnnpack/microparams-init.h>
26
+ #include <xnnpack/operator.h>
27
+ #include <xnnpack/pack.h>
28
+
29
+
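+ // Microbenchmark harness for unipass f16 depthwise-convolution kernels: the
+ // kernel consumes all `primary_tile` weight taps in a single pass.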
30
+ static void f16_dwconv(benchmark::State& state,
31
+ xnn_f16_dwconv_minmax_unipass_ukernel_fn dwconv,
32
+ xnn_init_f16_minmax_params_fn init_params,
33
+ uint32_t channel_tile, uint32_t primary_tile,
34
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
35
+ {
36
+ if (isa_check != nullptr && !isa_check(state)) {
37
+ return;
38
+ }
39
+
40
+ const size_t input_height = state.range(0);
41
+ const size_t input_width = state.range(1);
42
+ const size_t kernel_height = state.range(2);
43
+ const size_t kernel_width = state.range(3);
44
+ const size_t padding_height = state.range(4);
45
+ const size_t padding_width = state.range(5);
46
+ const size_t subsampling = state.range(6);
47
+ const size_t dilation = state.range(7);
48
+ const size_t channels = state.range(8);
49
+
50
+ const size_t kernel_size = kernel_height * kernel_width;
51
+ if (kernel_size > primary_tile) {
52
+ state.SkipWithError("kernel size mismatch");
53
+ return;
54
+ }
55
+
56
+ std::random_device random_device;
57
+ auto rng = std::mt19937(random_device());
58
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
59
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
60
+
61
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
62
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
63
+ const size_t padding_left = padding_width / 2;
64
+ const size_t padding_top = padding_height / 2;
65
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
66
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
67
+ const size_t output_size = output_height * output_width;
68
+ const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
69
+ const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
70
+
71
+ const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
72
+
73
+ std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
74
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
75
+ std::vector<uint16_t> k(channels * kernel_height * kernel_width);
76
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
77
+ std::vector<uint16_t> b(channels);
78
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
79
+
80
+ std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
81
+
82
+ const size_t w_elements = (kernel_size + 1) * c_stride;
83
+ // Can read (primary_tile - kernel_size) elements after end of indirection buffer.
84
+ const size_t i_elements = (primary_tile - kernel_size) + output_height * step_height;
85
+ const size_t c_elements = output_size * channels;
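+ // Allocate enough buffer copies to exceed the last-level cache; rotating through
+ // them keeps each iteration's weights and outputs out of a warm cache.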
86
+ const size_t num_buffers = 1 +
87
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
88
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
89
+
90
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
91
+ std::fill(w.begin(), w.end(), UINT16_C(0));
92
+ xnn_pack_f16_dwconv_ghw_w(primary_tile, 0, 0, kernel_height, kernel_width, channels,
93
+ channel_tile, channel_tile, /*channel_round=*/1,
94
+ k.data(), b.data(), w.data(),
95
+ /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
96
+ for (size_t n = 1; n < num_buffers; n++) {
97
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
98
+ }
99
+
100
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
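+ // Fill in a minimal operator struct so that xnn_indirection_init_dwconv2d can
+ // populate the indirection buffer with pointers into the input (or zero buffer).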
101
+ xnn_operator convolution_op = { };
102
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
103
+ convolution_op.input = a.data();
104
+ convolution_op.input_pixel_stride = channels;
105
+ convolution_op.zero_buffer = z.data();
106
+ convolution_op.input_height = input_height;
107
+ convolution_op.input_width = input_width;
108
+ convolution_op.output_height = output_height;
109
+ convolution_op.output_width = output_width;
110
+ convolution_op.kernel_height = kernel_height;
111
+ convolution_op.kernel_width = kernel_width;
112
+ convolution_op.stride_height = subsampling;
113
+ convolution_op.stride_width = subsampling;
114
+ convolution_op.dilation_height = dilation;
115
+ convolution_op.dilation_width = dilation;
116
+ convolution_op.padding_top = padding_top;
117
+ convolution_op.padding_left = padding_left;
118
+
119
+ xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, primary_tile, XNN_LOG2_SIZEOF_HALF);
120
+ for (size_t n = 1; n < num_buffers; n++) {
121
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
122
+ }
123
+
124
+ std::vector<uint16_t> c(c_elements * num_buffers);
125
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
126
+
127
+ xnn_f16_minmax_params params;
128
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
129
+
130
+ size_t buffer_index = 0;
131
+ for (auto _ : state) {
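+ // Outside the timed region: warm the input in cache and rotate to the next
+ // (cold) copy of the weights and outputs.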
132
+ state.PauseTiming();
133
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
134
+ buffer_index = (buffer_index + 1) % num_buffers;
135
+ state.ResumeTiming();
136
+
137
+ for (size_t y = 0; y < output_height; y++) {
138
+ dwconv(channels, output_width,
139
+ reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
140
+ w.data() + buffer_index * w_elements,
141
+ c.data() + buffer_index * c_elements + y * output_width * channels,
142
+ kernel_height * step_width * sizeof(void*), 0,
143
+ 0, z.data(), &params);
144
+ }
145
+ }
146
+
147
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
148
+ if (cpu_frequency != 0) {
149
+ state.counters["cpufreq"] = cpu_frequency;
150
+ }
151
+
152
+ state.counters["FLOPS"] = benchmark::Counter(
153
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
154
+
155
+ state.counters["bytes"] = benchmark::Counter(
156
+ uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
157
+ benchmark::Counter::kIsRate);
158
+ }
159
+
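+ // Overload for multipass f16 depthwise-convolution kernels, which split the taps
+ // into a first pass, repeated middle passes, and a last pass, accumulating
+ // intermediate results in `buffer`.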
160
+ static void f16_dwconv(benchmark::State& state,
161
+ xnn_f16_dwconv_minmax_multipass_ukernel_fn dwconv,
162
+ xnn_init_f16_minmax_params_fn init_params,
163
+ uint32_t first_pass_tile,
164
+ uint32_t middle_pass_tile,
165
+ uint32_t last_pass_tile,
166
+ uint32_t channel_tile,
167
+ uint32_t channel_subtile,
168
+ uint32_t channel_round,
169
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
170
+ {
171
+ if (isa_check != nullptr && !isa_check(state)) {
172
+ return;
173
+ }
174
+
175
+ const size_t input_height = state.range(0);
176
+ const size_t input_width = state.range(1);
177
+ const size_t kernel_height = state.range(2);
178
+ const size_t kernel_width = state.range(3);
179
+ const size_t padding_height = state.range(4);
180
+ const size_t padding_width = state.range(5);
181
+ const size_t subsampling = state.range(6);
182
+ const size_t dilation = state.range(7);
183
+ const size_t channels = state.range(8);
184
+
185
+ const size_t kernel_size = kernel_height * kernel_width;
186
+
187
+ if (kernel_size <= first_pass_tile) {
188
+ state.SkipWithError("kernel size mismatch");
189
+ return;
190
+ }
191
+
192
+ std::random_device random_device;
193
+ auto rng = std::mt19937(random_device());
194
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
195
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
196
+
197
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
198
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
199
+ const size_t padding_left = padding_width / 2;
200
+ const size_t padding_top = padding_height / 2;
201
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
202
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
203
+ const size_t output_size = output_height * output_width;
204
+ const size_t step_width = dilation == 1 ? std::min(subsampling, kernel_width) : kernel_width;
205
+ const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
206
+
207
+ std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
208
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
209
+ std::vector<uint16_t> k(channels * kernel_size);
210
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
211
+ std::vector<uint16_t> b(channels);
212
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
213
+
214
+ std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
215
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(channels + XNN_MULTIPASS_EXTRA_BYTES / sizeof(uint16_t));
216
+
217
+ const size_t tile_size = xnn_dwconv_multipass_tile_size(
218
+ kernel_size, first_pass_tile, middle_pass_tile, last_pass_tile);
219
+ const size_t w_elements =
220
+ xnn_dwconv_multipass_weights_size(
221
+ tile_size, channels, channel_tile, channel_subtile, channel_round, /*bias_element_size=*/sizeof(uint16_t),
222
+ /*log2_filter_element_size=*/1, /*extra_weights_byte=*/0) /
223
+ sizeof(uint16_t);
224
+ // Can read (tile_size - kernel_size) elements after end of indirection buffer.
225
+ const size_t i_elements = tile_size - kernel_size + output_height * step_height;
226
+ const size_t c_elements = output_size * channels;
227
+ const size_t num_buffers = 1 +
228
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
229
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
230
+
231
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
232
+ std::fill(w.begin(), w.end(), UINT16_C(0));
233
+ xnn_pack_f16_dwconv_ghw_w(
234
+ first_pass_tile, middle_pass_tile, last_pass_tile,
235
+ kernel_height, kernel_width,
236
+ channels, channel_tile, channel_subtile, channel_round,
237
+ k.data(), b.data(), w.data(), /*per_tile_extra_bytes=*/0, /*per_subtile_extra_bytes=*/0, nullptr);
238
+ for (size_t n = 1; n < num_buffers; n++) {
239
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
240
+ }
241
+
242
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
243
+ xnn_operator convolution_op = { };
244
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
245
+ convolution_op.input = a.data();
246
+ convolution_op.input_pixel_stride = channels;
247
+ convolution_op.zero_buffer = z.data();
248
+ convolution_op.input_height = input_height;
249
+ convolution_op.input_width = input_width;
250
+ convolution_op.output_height = output_height;
251
+ convolution_op.output_width = output_width;
252
+ convolution_op.kernel_height = kernel_height;
253
+ convolution_op.kernel_width = kernel_width;
254
+ convolution_op.stride_height = subsampling;
255
+ convolution_op.stride_width = subsampling;
256
+ convolution_op.dilation_height = dilation;
257
+ convolution_op.dilation_width = dilation;
258
+ convolution_op.padding_top = padding_top;
259
+ convolution_op.padding_left = padding_left;
260
+
261
+ xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, tile_size, XNN_LOG2_SIZEOF_HALF);
262
+ for (size_t n = 1; n < num_buffers; n++) {
263
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
264
+ }
265
+
266
+ std::vector<uint16_t> c(c_elements * num_buffers);
267
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
268
+
269
+ xnn_f16_minmax_params params;
270
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
271
+
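+ // The multipass kernel is assumed to advance its indirection pointer by
+ // (tile_size - last_pass_tile) entries across its first and middle passes;
+ // the per-row input stride compensates for that advance.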
272
+ const int input_advanced = tile_size - last_pass_tile;
273
+ const int input_stride_elements = kernel_height * step_width - input_advanced;
274
+ size_t buffer_index = 0;
275
+ for (auto _ : state) {
276
+ state.PauseTiming();
277
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
278
+ buffer_index = (buffer_index + 1) % num_buffers;
279
+ state.ResumeTiming();
280
+
281
+ for (size_t y = 0; y < output_height; y++) {
282
+ dwconv(channels, output_width,
283
+ reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
284
+ w.data() + buffer_index * w_elements,
285
+ c.data() + buffer_index * c_elements + y * output_width * channels,
286
+ input_stride_elements * sizeof(void*), 0,
287
+ 0, z.data(), kernel_size, buffer.data(), &params);
288
+ }
289
+ }
290
+
291
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
292
+ if (cpu_frequency != 0) {
293
+ state.counters["cpufreq"] = cpu_frequency;
294
+ }
295
+
296
+ state.counters["FLOPS"] = benchmark::Counter(
297
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
298
+
299
+ state.counters["bytes"] = benchmark::Counter(
300
+ uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
301
+ benchmark::Counter::kIsRate);
302
+ }
303
+
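+ // Wrapper naming: <P>p<C>c denotes a unipass kernel with primary tile P and
+ // channel tile C; <F>f<M>m<L>l<C>c<S>s<R>r denotes a multipass kernel with
+ // first/middle/last-pass tiles F/M/L, channel tile C, subtile S, and round R.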
304
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
305
+ static void f16_dwconv_4p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
306
+ f16_dwconv(state,
307
+ xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith_acc2,
308
+ xnn_init_f16_minmax_fp16arith_params,
309
+ 8, 4, benchmark::utils::CheckNEONFP16ARITH);
310
+ }
311
+
312
+ static void f16_dwconv_4p8c__neonfp16arith(benchmark::State& state, const char* net) {
313
+ f16_dwconv(state,
314
+ xnn_f16_dwconv_minmax_ukernel_4p8c__neonfp16arith,
315
+ xnn_init_f16_minmax_fp16arith_params,
316
+ 8, 4, benchmark::utils::CheckNEONFP16ARITH);
317
+ }
318
+
319
+ static void f16_dwconv_9p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
320
+ f16_dwconv(state,
321
+ xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith_acc2,
322
+ xnn_init_f16_minmax_fp16arith_params,
323
+ 8, 9, benchmark::utils::CheckNEONFP16ARITH);
324
+ }
325
+
326
+ static void f16_dwconv_9p8c__neonfp16arith(benchmark::State& state, const char* net) {
327
+ f16_dwconv(state,
328
+ xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith,
329
+ xnn_init_f16_minmax_fp16arith_params,
330
+ 8, 9, benchmark::utils::CheckNEONFP16ARITH);
331
+ }
332
+
333
+ static void f16_dwconv_25p8c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
334
+ f16_dwconv(state,
335
+ xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2,
336
+ xnn_init_f16_minmax_fp16arith_params,
337
+ 8, 25, benchmark::utils::CheckNEONFP16ARITH);
338
+ }
339
+
340
+ static void f16_dwconv_25p8c__neonfp16arith(benchmark::State& state, const char* net) {
341
+ f16_dwconv(state,
342
+ xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith,
343
+ xnn_init_f16_minmax_fp16arith_params,
344
+ 8, 25, benchmark::utils::CheckNEONFP16ARITH);
345
+ }
346
+
347
+ static void f16_dwconv_4p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
348
+ f16_dwconv(state,
349
+ xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith_acc2,
350
+ xnn_init_f16_minmax_fp16arith_params,
351
+ 16, 4, benchmark::utils::CheckNEONFP16ARITH);
352
+ }
353
+
354
+ static void f16_dwconv_4p16c__neonfp16arith(benchmark::State& state, const char* net) {
355
+ f16_dwconv(state,
356
+ xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith,
357
+ xnn_init_f16_minmax_fp16arith_params,
358
+ 16, 4, benchmark::utils::CheckNEONFP16ARITH);
359
+ }
360
+
361
+ static void f16_dwconv_9p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
362
+ f16_dwconv(state,
363
+ xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith_acc2,
364
+ xnn_init_f16_minmax_fp16arith_params,
365
+ 16, 9, benchmark::utils::CheckNEONFP16ARITH);
366
+ }
367
+
368
+ static void f16_dwconv_9p16c__neonfp16arith(benchmark::State& state, const char* net) {
369
+ f16_dwconv(state,
370
+ xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith,
371
+ xnn_init_f16_minmax_fp16arith_params,
372
+ 16, 9, benchmark::utils::CheckNEONFP16ARITH);
373
+ }
374
+
375
+ static void f16_dwconv_25p16c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
376
+ f16_dwconv(state,
377
+ xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith_acc2,
378
+ xnn_init_f16_minmax_fp16arith_params,
379
+ 16, 25, benchmark::utils::CheckNEONFP16ARITH);
380
+ }
381
+
382
+ static void f16_dwconv_25p16c__neonfp16arith(benchmark::State& state, const char* net) {
383
+ f16_dwconv(state,
384
+ xnn_f16_dwconv_minmax_ukernel_25p16c__neonfp16arith,
385
+ xnn_init_f16_minmax_fp16arith_params,
386
+ 16, 25, benchmark::utils::CheckNEONFP16ARITH);
387
+ }
388
+
389
+ static void f16_dwconv_4p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
390
+ f16_dwconv(state,
391
+ xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith_acc2,
392
+ xnn_init_f16_minmax_fp16arith_params,
393
+ 32, 4, benchmark::utils::CheckNEONFP16ARITH);
394
+ }
395
+
396
+ static void f16_dwconv_4p32c__neonfp16arith(benchmark::State& state, const char* net) {
397
+ f16_dwconv(state,
398
+ xnn_f16_dwconv_minmax_ukernel_4p32c__neonfp16arith,
399
+ xnn_init_f16_minmax_fp16arith_params,
400
+ 32, 4, benchmark::utils::CheckNEONFP16ARITH);
401
+ }
402
+
403
+ static void f16_dwconv_9p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
404
+ f16_dwconv(state,
405
+ xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith_acc2,
406
+ xnn_init_f16_minmax_fp16arith_params,
407
+ 32, 9, benchmark::utils::CheckNEONFP16ARITH);
408
+ }
409
+
410
+ static void f16_dwconv_9p32c__neonfp16arith(benchmark::State& state, const char* net) {
411
+ f16_dwconv(state,
412
+ xnn_f16_dwconv_minmax_ukernel_9p32c__neonfp16arith,
413
+ xnn_init_f16_minmax_fp16arith_params,
414
+ 32, 9, benchmark::utils::CheckNEONFP16ARITH);
415
+ }
416
+
417
+ static void f16_dwconv_25p32c__neonfp16arith_acc2(benchmark::State& state, const char* net) {
418
+ f16_dwconv(state,
419
+ xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith_acc2,
420
+ xnn_init_f16_minmax_fp16arith_params,
421
+ 32, 25, benchmark::utils::CheckNEONFP16ARITH);
422
+ }
423
+
424
+ static void f16_dwconv_25p32c__neonfp16arith(benchmark::State& state, const char* net) {
425
+ f16_dwconv(state,
426
+ xnn_f16_dwconv_minmax_ukernel_25p32c__neonfp16arith,
427
+ xnn_init_f16_minmax_fp16arith_params,
428
+ 32, 25, benchmark::utils::CheckNEONFP16ARITH);
429
+ }
430
+
431
+ static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
432
+ f16_dwconv(
433
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
434
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
435
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
436
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
437
+ }
438
+ static void f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
439
+ f16_dwconv(
440
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
441
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
442
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
443
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
444
+ }
445
+ static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
446
+ f16_dwconv(
447
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
448
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
449
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
450
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
451
+ }
452
+ static void f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
453
+ f16_dwconv(
454
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
455
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
456
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
457
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
458
+ }
459
+ static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
460
+ f16_dwconv(
461
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
462
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
463
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
464
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
465
+ }
466
+ static void f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
467
+ f16_dwconv(
468
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
469
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
470
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
471
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
472
+ }
473
+
474
+ static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
475
+ f16_dwconv(
476
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
477
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
478
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
479
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
480
+ }
481
+ static void f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
482
+ f16_dwconv(
483
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
484
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
485
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
486
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
487
+ }
488
+ static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
489
+ f16_dwconv(
490
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
491
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
492
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
493
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
494
+ }
495
+ static void f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
496
+ f16_dwconv(
497
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
498
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
499
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
500
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
501
+ }
502
+ static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
503
+ f16_dwconv(
504
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
505
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
506
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
507
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
508
+ }
509
+ static void f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
510
+ f16_dwconv(
511
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
512
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
513
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
514
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
515
+ }
516
+
517
+ static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
518
+ f16_dwconv(
519
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
520
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
521
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
522
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
523
+ }
524
+ static void f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
525
+ f16_dwconv(
526
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
527
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
528
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
529
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
530
+ }
531
+ static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
532
+ f16_dwconv(
533
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
534
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
535
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
536
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
537
+ }
538
+ static void f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
539
+ f16_dwconv(
540
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
541
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
542
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
543
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
544
+ }
545
+ static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith(benchmark::State& state, const char* net) {
546
+ f16_dwconv(
547
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith, xnn_init_f16_minmax_fp16arith_params,
548
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
549
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
550
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
551
+ }
552
+ static void f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2(benchmark::State& state, const char* net) {
553
+ f16_dwconv(
554
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__neonfp16arith_acc2, xnn_init_f16_minmax_fp16arith_params,
555
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
556
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
557
+ /*isa_check=*/benchmark::utils::CheckNEONFP16ARITH);
558
+ }
559
+
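+ // Register each wrapper against the depthwise-convolution shapes from bench/dwconv.h.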
560
+ BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith_acc2)
561
+ BENCHMARK_DWCONV(f16_dwconv_4p8c__neonfp16arith)
562
+ BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith_acc2)
563
+ BENCHMARK_DWCONV(f16_dwconv_9p8c__neonfp16arith)
564
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith_acc2)
565
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__neonfp16arith)
566
+ BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith_acc2)
567
+ BENCHMARK_DWCONV(f16_dwconv_4p16c__neonfp16arith)
568
+ BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith_acc2)
569
+ BENCHMARK_DWCONV(f16_dwconv_9p16c__neonfp16arith)
570
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith_acc2)
571
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__neonfp16arith)
572
+ BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith_acc2)
573
+ BENCHMARK_DWCONV(f16_dwconv_4p32c__neonfp16arith)
574
+ BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith_acc2)
575
+ BENCHMARK_DWCONV(f16_dwconv_9p32c__neonfp16arith)
576
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith_acc2)
577
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__neonfp16arith)
578
+
579
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith)
580
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__neonfp16arith_acc2)
581
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith)
582
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__neonfp16arith_acc2)
583
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith)
584
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__neonfp16arith_acc2)
585
+
586
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith)
587
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__neonfp16arith_acc2)
588
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith)
589
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__neonfp16arith_acc2)
590
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith)
591
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__neonfp16arith_acc2)
592
+
593
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith)
594
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__neonfp16arith_acc2)
595
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith)
596
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__neonfp16arith_acc2)
597
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith)
598
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__neonfp16arith_acc2)
599
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
600
+
601
+
602
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
603
+ static void f16_dwconv_25p8c__fma3(benchmark::State& state, const char* net) {
604
+ f16_dwconv(
605
+ state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3, xnn_init_f16_minmax_avx_params,
606
+ /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
607
+ }
608
+ static void f16_dwconv_25p8c__fma3_acc2(benchmark::State& state, const char* net) {
609
+ f16_dwconv(
610
+ state, xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2, xnn_init_f16_minmax_avx_params,
611
+ /*channel_tile=*/8, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
612
+ }
613
+ static void f16_dwconv_25p16c__fma3(benchmark::State& state, const char* net) {
614
+ f16_dwconv(
615
+ state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3, xnn_init_f16_minmax_avx_params,
616
+ /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
617
+ }
618
+ static void f16_dwconv_25p16c__fma3_acc2(benchmark::State& state, const char* net) {
619
+ f16_dwconv(
620
+ state, xnn_f16_dwconv_minmax_ukernel_25p16c__fma3_acc2, xnn_init_f16_minmax_avx_params,
621
+ /*channel_tile=*/16, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
622
+ }
623
+ static void f16_dwconv_25p32c__fma3(benchmark::State& state, const char* net) {
624
+ f16_dwconv(
625
+ state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3, xnn_init_f16_minmax_avx_params,
626
+ /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
627
+ }
628
+ static void f16_dwconv_25p32c__fma3_acc2(benchmark::State& state, const char* net) {
629
+ f16_dwconv(
630
+ state, xnn_f16_dwconv_minmax_ukernel_25p32c__fma3_acc2, xnn_init_f16_minmax_avx_params,
631
+ /*channel_tile=*/32, /*primary_tile=*/25, /*isa_check=*/benchmark::utils::CheckFMA3);
632
+ }
633
+
634
+ static void f16_dwconv_5f5m5l8c8s4r__fma3(benchmark::State& state, const char* net) {
635
+ f16_dwconv(
636
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
637
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
638
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
639
+ /*isa_check=*/benchmark::utils::CheckFMA3);
640
+ }
641
+ static void f16_dwconv_5f5m5l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
642
+ f16_dwconv(
643
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
644
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
645
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
646
+ /*isa_check=*/benchmark::utils::CheckFMA3);
647
+ }
648
+ static void f16_dwconv_5f5m5l16c8s4r__fma3(benchmark::State& state, const char* net) {
649
+ f16_dwconv(
650
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
651
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
652
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
653
+ /*isa_check=*/benchmark::utils::CheckFMA3);
654
+ }
655
+ static void f16_dwconv_5f5m5l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
656
+ f16_dwconv(
657
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
658
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
659
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
660
+ /*isa_check=*/benchmark::utils::CheckFMA3);
661
+ }
662
+ static void f16_dwconv_5f5m5l32c8s4r__fma3(benchmark::State& state, const char* net) {
663
+ f16_dwconv(
664
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
665
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
666
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
667
+ /*isa_check=*/benchmark::utils::CheckFMA3);
668
+ }
669
+ static void f16_dwconv_5f5m5l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
670
+ f16_dwconv(
671
+ state, xnn_f16_dwconv_minmax_ukernel_5f5m5l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
672
+ /*first_pass_tile=*/5, /*middle_pass_tile=*/5, /*last_pass_tile=*/5,
673
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
674
+ /*isa_check=*/benchmark::utils::CheckFMA3);
675
+ }
676
+
677
+ static void f16_dwconv_6f6m7l8c8s4r__fma3(benchmark::State& state, const char* net) {
678
+ f16_dwconv(
679
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
680
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
681
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
682
+ /*isa_check=*/benchmark::utils::CheckFMA3);
683
+ }
684
+ static void f16_dwconv_6f6m7l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
685
+ f16_dwconv(
686
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
687
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
688
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
689
+ /*isa_check=*/benchmark::utils::CheckFMA3);
690
+ }
691
+ static void f16_dwconv_6f6m7l16c8s4r__fma3(benchmark::State& state, const char* net) {
692
+ f16_dwconv(
693
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
694
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
695
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
696
+ /*isa_check=*/benchmark::utils::CheckFMA3);
697
+ }
698
+ static void f16_dwconv_6f6m7l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
699
+ f16_dwconv(
700
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
701
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
702
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
703
+ /*isa_check=*/benchmark::utils::CheckFMA3);
704
+ }
705
+ static void f16_dwconv_6f6m7l32c8s4r__fma3(benchmark::State& state, const char* net) {
706
+ f16_dwconv(
707
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
708
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
709
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
710
+ /*isa_check=*/benchmark::utils::CheckFMA3);
711
+ }
712
+ static void f16_dwconv_6f6m7l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
713
+ f16_dwconv(
714
+ state, xnn_f16_dwconv_minmax_ukernel_6f6m7l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
715
+ /*first_pass_tile=*/6, /*middle_pass_tile=*/6, /*last_pass_tile=*/7,
716
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
717
+ /*isa_check=*/benchmark::utils::CheckFMA3);
718
+ }
719
+
720
+ static void f16_dwconv_8f8m9l8c8s4r__fma3(benchmark::State& state, const char* net) {
721
+ f16_dwconv(
722
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3, xnn_init_f16_minmax_avx_params,
723
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
724
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
725
+ /*isa_check=*/benchmark::utils::CheckFMA3);
726
+ }
727
+ static void f16_dwconv_8f8m9l8c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
728
+ f16_dwconv(
729
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l8c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
730
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
731
+ /*channel_tile=*/8, /*channel_subtile=*/8, /*channel_round=*/4,
732
+ /*isa_check=*/benchmark::utils::CheckFMA3);
733
+ }
734
+ static void f16_dwconv_8f8m9l16c8s4r__fma3(benchmark::State& state, const char* net) {
735
+ f16_dwconv(
736
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3, xnn_init_f16_minmax_avx_params,
737
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
738
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
739
+ /*isa_check=*/benchmark::utils::CheckFMA3);
740
+ }
741
+ static void f16_dwconv_8f8m9l16c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
742
+ f16_dwconv(
743
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l16c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
744
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
745
+ /*channel_tile=*/16, /*channel_subtile=*/8, /*channel_round=*/4,
746
+ /*isa_check=*/benchmark::utils::CheckFMA3);
747
+ }
748
+ static void f16_dwconv_8f8m9l32c8s4r__fma3(benchmark::State& state, const char* net) {
749
+ f16_dwconv(
750
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3, xnn_init_f16_minmax_avx_params,
751
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
752
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
753
+ /*isa_check=*/benchmark::utils::CheckFMA3);
754
+ }
755
+ static void f16_dwconv_8f8m9l32c8s4r__fma3_acc2(benchmark::State& state, const char* net) {
756
+ f16_dwconv(
757
+ state, xnn_f16_dwconv_minmax_ukernel_8f8m9l32c8s4r__fma3_acc2, xnn_init_f16_minmax_avx_params,
758
+ /*first_pass_tile=*/8, /*middle_pass_tile=*/8, /*last_pass_tile=*/9,
759
+ /*channel_tile=*/32, /*channel_subtile=*/8, /*channel_round=*/4,
760
+ /*isa_check=*/benchmark::utils::CheckFMA3);
761
+ }
762
+
763
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3)
764
+ BENCHMARK_DWCONV(f16_dwconv_25p8c__fma3_acc2)
765
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3)
766
+ BENCHMARK_DWCONV(f16_dwconv_25p16c__fma3_acc2)
767
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3)
768
+ BENCHMARK_DWCONV(f16_dwconv_25p32c__fma3_acc2)
769
+
770
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3)
771
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l8c8s4r__fma3_acc2)
772
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3)
773
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l16c8s4r__fma3_acc2)
774
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3)
775
+ BENCHMARK_DWCONV(f16_dwconv_5f5m5l32c8s4r__fma3_acc2)
776
+
777
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3)
778
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l8c8s4r__fma3_acc2)
779
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3)
780
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l16c8s4r__fma3_acc2)
781
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3)
782
+ BENCHMARK_DWCONV(f16_dwconv_6f6m7l32c8s4r__fma3_acc2)
783
+
784
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3)
785
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l8c8s4r__fma3_acc2)
786
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3)
787
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l16c8s4r__fma3_acc2)
788
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3)
789
+ BENCHMARK_DWCONV(f16_dwconv_8f8m9l32c8s4r__fma3_acc2)
790
+
791
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
792
+
793
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
794
+ BENCHMARK_MAIN();
795
+ #endif
bench/f16-dwconv2d-chw.cc ADDED
@@ -0,0 +1,496 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/dwconv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/dwconv.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microparams-init.h>
25
+ #include <xnnpack/operator.h>
26
+ #include <xnnpack/pack.h>
27
+
28
+
29
+ static void f16_dwconv2d_chw(benchmark::State& state,
30
+ xnn_f16_dwconv2d_chw_ukernel_fn dwconv,
31
+ xnn_init_f16_chw_params_fn init_params,
32
+ uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s,
33
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
34
+ {
35
+ if ((isa_check != nullptr) && !isa_check(state)) {
36
+ return;
37
+ }
38
+
39
+ const size_t input_height = state.range(0);
40
+ const size_t input_width = state.range(1);
41
+ const size_t kernel_height = state.range(2);
42
+ const size_t kernel_width = state.range(3);
43
+ const size_t padding_height = state.range(4);
44
+ const size_t padding_width = state.range(5);
45
+ const size_t subsampling = state.range(6);
46
+ const size_t dilation = state.range(7);
47
+ const size_t channels = state.range(8);
48
+
49
+ if (kernel_height != kh) {
50
+ state.SkipWithError("kernel height mismatch");
51
+ return;
52
+ }
53
+
54
+ if (kernel_width != kw) {
55
+ state.SkipWithError("kernel width mismatch");
56
+ return;
57
+ }
58
+
59
+ if (subsampling != s) {
60
+ state.SkipWithError("subsampling mismatch");
61
+ return;
62
+ }
63
+
64
+ if (padding_width % 2 != 0 || padding_width / 2 != pw) {
65
+ state.SkipWithError("padding width mismatch");
66
+ return;
67
+ }
68
+
69
+ if (dilation != 1) {
70
+ state.SkipWithError("unsupported dilation");
71
+ return;
72
+ }
73
+
74
+ std::random_device random_device;
75
+ auto rng = std::mt19937(random_device());
76
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
77
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
78
+
79
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
80
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
81
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
82
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
83
+
84
+ const size_t inputSize = (input_height + padding_height) * input_width;
85
+ const size_t kernel_size = kernel_height * kernel_width;
86
+ const size_t output_size = output_height * output_width;
87
+
88
+ std::vector<uint16_t> input(inputSize * channels + 2 * XNN_EXTRA_BYTES);
89
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
90
+ std::vector<uint16_t> bias(channels);
91
+ std::generate(bias.begin(), bias.end(), std::ref(f16rng));
92
+ std::vector<uint16_t> kernel(channels * kernel_size);
93
+ std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
94
+ std::vector<uint16_t> zero(input_width + padding_width);
95
+
96
+ const size_t w_elements = (kernel_size + 1) * channels;
97
+ const size_t o_elements = output_size * channels;
98
+ const size_t num_buffers = 1 +
99
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
100
+ sizeof(uint16_t) * (w_elements + o_elements));
101
+
102
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(w_elements * num_buffers);
103
+ std::fill(packed_weights.begin(), packed_weights.end(), UINT16_C(0));
104
+ for (size_t c = 0; c < channels; c++) {
105
+ packed_weights[c * kernel_size + c] = bias[c];
106
+ for (size_t i = 0; i < kernel_size; i++) {
107
+ packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
108
+ }
109
+ }
110
+ for (size_t n = 1; n < num_buffers; n++) {
111
+ std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
112
+ }
113
+
114
+ std::vector<uint16_t> output(o_elements * num_buffers);
115
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
116
+
117
+ xnn_f16_chw_params chw_params;
118
+ init_params(&chw_params,
119
+ input_width, 0xFC00 /* -inf */, 0x7C00 /* inf */);
120
+
121
+ size_t buffer_index = 0;
122
+ for (auto _ : state) {
123
+ state.PauseTiming();
124
+ benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
125
+ buffer_index = (buffer_index + 1) % num_buffers;
126
+ state.ResumeTiming();
127
+
128
+ for (uint32_t channel = 0; channel < channels; channel++) {
129
+ dwconv(
130
+ input_height, input_width * sizeof(uint16_t),
131
+ input.data() + channel * inputSize,
132
+ packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
133
+ zero.data(),
134
+ output.data() + channel * output_size + buffer_index * o_elements,
135
+ padding_height / 2, // padding_top
136
+ &chw_params);
137
+ }
138
+ }
139
+
140
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
141
+ if (cpu_frequency != 0) {
142
+ state.counters["cpufreq"] = cpu_frequency;
143
+ }
144
+
145
+ state.counters["FLOPS"] = benchmark::Counter(
146
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
147
+ benchmark::Counter::kIsRate);
148
+
149
+ state.counters["bytes"] = benchmark::Counter(
150
+ uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
151
+ benchmark::Counter::kIsRate);
152
+ }
153
+
154
+
155
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
156
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8(benchmark::State& state, const char* net) {
157
+ f16_dwconv2d_chw(state,
158
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8,
159
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
160
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
161
+ benchmark::utils::CheckNEONFP16ARITH);
162
+ }
163
+ static void dwconv2d_chw_3x3p1__neonfp16arith_2x8(benchmark::State& state, const char* net) {
164
+ f16_dwconv2d_chw(state,
165
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
166
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
167
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
168
+ benchmark::utils::CheckNEONFP16ARITH);
169
+ }
170
+ static void dwconv2d_chw_3x3p1__neonfp16arith_3x8(benchmark::State& state, const char* net) {
171
+ f16_dwconv2d_chw(state,
172
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_3x8,
173
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
174
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
175
+ benchmark::utils::CheckNEONFP16ARITH);
176
+ }
177
+ static void dwconv2d_chw_3x3p1__neonfp16arith_4x8(benchmark::State& state, const char* net) {
178
+ f16_dwconv2d_chw(state,
179
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_4x8,
180
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
181
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
182
+ benchmark::utils::CheckNEONFP16ARITH);
183
+ }
184
+ static void dwconv2d_chw_3x3p1__neonfp16arith_5x8(benchmark::State& state, const char* net) {
185
+ f16_dwconv2d_chw(state,
186
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_5x8,
187
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
188
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
189
+ benchmark::utils::CheckNEONFP16ARITH);
190
+ }
191
+ static void dwconv2d_chw_3x3p1__neonfp16arith_6x8(benchmark::State& state, const char* net) {
192
+ f16_dwconv2d_chw(state,
193
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_6x8,
194
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
195
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
196
+ benchmark::utils::CheckNEONFP16ARITH);
197
+ }
198
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
199
+ f16_dwconv2d_chw(state,
200
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc2,
201
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
202
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
203
+ benchmark::utils::CheckNEONFP16ARITH);
204
+ }
205
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
206
+ f16_dwconv2d_chw(state,
207
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc3,
208
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
209
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
210
+ benchmark::utils::CheckNEONFP16ARITH);
211
+ }
212
+ static void dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
213
+ f16_dwconv2d_chw(state,
214
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x8_acc4,
215
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
216
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
217
+ benchmark::utils::CheckNEONFP16ARITH);
218
+ }
219
+ static void dwconv2d_chw_3x3p1__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
220
+ f16_dwconv2d_chw(state,
221
+ xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8_acc2,
222
+ xnn_init_f16_chw_neonfp16arith_stride1_params,
223
+ 3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 1 /* stride */,
224
+ benchmark::utils::CheckNEONFP16ARITH);
225
+ }
226
+
227
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8(benchmark::State& state, const char* net) {
228
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_4x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_4x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     3 /* kernel height */, 3 /* kernel width */, 1 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_4x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_5x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_5x8,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x8_acc5,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride1_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 1 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc4,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x8_acc5,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x8_acc3,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2(benchmark::State& state, const char* net) {
+   f16_dwconv2d_chw(state,
+     xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_3x8_acc2,
+     xnn_init_f16_chw_neonfp16arith_stride2_params,
+     5 /* kernel height */, 5 /* kernel width */, 2 /* width padding */, 2 /* stride */,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
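+ // Benchmark registrations. Suffix convention (per the XNNPACK microkernel naming
+ // scheme): KhxKw[sS]pP = kernel size, stride, and one-sided width padding;
+ // MxN = output rows x pixels computed per microkernel iteration; accK = K separate
+ // vector accumulators, combined at the end, to hide floating-point latency.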
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_5x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_6x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfp16arith_2x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfp16arith_2x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_5x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_1x8_acc5)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_2x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_3x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfp16arith_4x8_acc2)
+
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc4)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_1x8_acc5)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc2)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_2x8_acc3)
+ BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfp16arith_3x8_acc2)
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32-vcvt.cc ADDED
@@ -0,0 +1,414 @@
+ // Copyright 2021 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vcvt.h>
+
+
+ static void f16_f32_vcvt(
+   benchmark::State& state,
+   xnn_f16_f32_vcvt_ukernel_fn cvt,
+   xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+
+   const size_t num_elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
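+   // The input is over-allocated by XNN_EXTRA_BYTES: XNNPACK microkernels are allowed
+   // to read (but not depend on) a few bytes past the last element.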
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::vector<float, AlignedAllocator<float, 64>> y(num_elements);
+   std::generate(x.begin(), x.end(), std::ref(f16rng));
+   std::fill(y.begin(), y.end(), std::nanf(""));
+
+   xnn_f16_f32_cvt_params params;
+   if (init_params != nullptr) {
+     init_params(&params);
+   }
+   for (auto _ : state) {
+     cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = num_elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = num_elements * (sizeof(uint16_t) + sizeof(float));
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
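+ // One benchmark per microkernel variant. The suffix names the implementation and
+ // batch tile: xN converts N elements per loop iteration; the *_int16_* / *_int32_*
+ // variants widen f16 to f32 with integer bit manipulation at that lane width, while
+ // the neonfp16 / f16c / avx512skx variants use native hardware conversion
+ // instructions (and therefore pass no init params).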
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x8,
+     xnn_f16_f32_vcvt_ukernel__neonfp16_x8,
+     nullptr /* init params */,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neonfp16_x16,
+     xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x8,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x24,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__neon_int16_x32,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x8,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x16,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x24,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__neon_int32_x32,
+     xnn_init_f16_f32_cvt_neon_params,
+     benchmark::utils::CheckNEON)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x16,
+     xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckAVX512SKX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx512skx_x32,
+     xnn_f16_f32_vcvt_ukernel__avx512skx_x32,
+     nullptr /* init params */,
+     benchmark::utils::CheckAVX512SKX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x8,
+     xnn_f16_f32_vcvt_ukernel__f16c_x8,
+     nullptr /* init params */,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, f16c_x16,
+     xnn_f16_f32_vcvt_ukernel__f16c_x16,
+     nullptr /* init params */,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__avx_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__avx_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckAVX)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__sse41_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__sse41_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params,
+     benchmark::utils::CheckSSE41)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x8,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x16,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x24,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
+     xnn_init_f16_f32_cvt_sse_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x8,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x16,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x24,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__sse2_int32_x32,
+     xnn_init_f16_f32_cvt_sse_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_WASMRELAXEDSIMD
+
+ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x8,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x8,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x16,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x16,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x24,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x24,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_x32,
+     xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_x32,
+     xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x1,
+     xnn_f16_f32_vcvt_ukernel__scalar_x1,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x2,
+     xnn_f16_f32_vcvt_ukernel__scalar_x2,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x3,
+     xnn_f16_f32_vcvt_ukernel__scalar_x3,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_x4,
+     xnn_f16_f32_vcvt_ukernel__scalar_x4,
+     xnn_init_f16_f32_cvt_scalar_params)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
+   ->UseRealTime();
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-gemm.cc ADDED
@@ -0,0 +1,162 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ // All rights reserved.
+ //
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/gemm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/gemm.h>
+ #include <xnnpack/math.h>
+ #include <xnnpack/pack.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+
+
+ static void f16_gemm(benchmark::State& state,
+   xnn_f16_gemm_minmax_ukernel_fn gemm,
+   size_t mr, size_t nr, size_t kr, size_t sr,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(nc * kc);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(nc);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   const size_t w_elements = nc_stride * kc_stride + nc_stride;
+   const size_t c_elements = mc * nc;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements));
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
+     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
+     // - W is not in cache (for any cache level)
+     // - C is not in cache (for any cache level)
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < mc; m += mr) {
+       const uint32_t mb = min(mc - m, mr);
+       for (uint32_t n = 0; n < nc; n += nr) {
+         const uint32_t nb = min(nc - n, nr);
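+         // Packed weights interleave, for each strip of nr output columns, nr bias
+         // values followed by that strip's weights, so strip n begins
+         // (n * (kc_stride + 1)) elements into the buffer.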
+         gemm(
+           mb, nb, kc * sizeof(uint16_t),
+           a.data() + m * kc, kc * sizeof(uint16_t),
+           w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
+           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
+           &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
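+   // Each of the mc * nc outputs takes kc multiply-adds, counted as 2 FLOPs apiece.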
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_f32acc_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_gemm(state, xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_GEMM(f16_f32acc_gemm_1x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_4x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_5x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_6x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_7x8__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_1x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_3x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_4x16__avx2_broadcast)
+ BENCHMARK_GEMM(f16_f32acc_gemm_5x16__avx2_broadcast)
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-igemm.cc ADDED
@@ -0,0 +1,214 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cfloat>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/conv.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/igemm.h>
+ #include <xnnpack/indirection.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/operator.h>
+ #include <xnnpack/pack.h>
+
+
+ static void f16_igemm(benchmark::State& state,
+   xnn_f16_igemm_minmax_ukernel_fn igemm,
+   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t kernel_height = state.range(2);
+   const size_t kernel_width = state.range(3);
+   const size_t kernel_size = kernel_height * kernel_width;
+   const size_t padding_height = state.range(4);
+   const size_t padding_width = state.range(5);
+   const size_t subsampling = state.range(6);
+   const size_t dilation = state.range(7);
+   const size_t group_input_channels = state.range(8);
+   const size_t group_output_channels = state.range(9);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t output_pixel_stride = group_output_channels;
+   const size_t input_pixel_stride = group_input_channels;
+   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+   const size_t padding_left = padding_width / 2;
+   const size_t padding_top = padding_height / 2;
+   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+   const size_t output_size = output_height * output_width;
+
+   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
+   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
+
+   std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(group_output_channels);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
+   const size_t i_elements = mc_stride * kernel_size;
+   const size_t c_elements = output_height * output_width * output_pixel_stride;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_conv_goki_w(
+     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
+     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+   }
+
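+   // The IGEMM microkernel reads its input through an indirection buffer: an array of
+   // row pointers, one per (kernel element, output pixel) pair, filled in by
+   // xnn_indirection_init_conv2d below. This stands in for an explicit im2col copy.
+   // A stub xnn_operator carries the convolution geometry to that init helper.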
+   std::vector<const uint16_t*> i(i_elements * num_buffers);
+   xnn_operator convolution_op = { };
+   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+   convolution_op.input = a.data();
+   convolution_op.input_pixel_stride = input_pixel_stride;
+   convolution_op.zero_buffer = z.data();
+   convolution_op.groups = 1;
+   convolution_op.group_input_channels = group_input_channels;
+   convolution_op.batch_size = 1;
+   convolution_op.input_height = input_height;
+   convolution_op.input_width = input_width;
+   convolution_op.output_height = output_height;
+   convolution_op.output_width = output_width;
+   convolution_op.kernel_height = kernel_height;
+   convolution_op.kernel_width = kernel_width;
+   convolution_op.stride_height = subsampling;
+   convolution_op.stride_width = subsampling;
+   convolution_op.dilation_height = dilation;
+   convolution_op.dilation_width = dilation;
+   convolution_op.padding_top = padding_top;
+   convolution_op.padding_left = padding_left;
+   xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+   }
+
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters (min first, then max, matching the f16 GEMM benchmark).
+   xnn_f16_minmax_params params;
+   init_params(&params,
+     UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < output_size; m += mr) {
+       const uint32_t mb = min(output_size - m, mr);
+       for (uint32_t n = 0; n < group_output_channels; n += nr) {
+         const uint32_t nb = min(group_output_channels - n, nr);
+         igemm(
+           mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
+           reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
+           w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
+           c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
+           0, z.data(), &params);
+       }
+     }
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_f32acc_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast, 1, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast, 4, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast, 5, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast, 6, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast, 7, 8, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast, 1, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast, 3, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast, 4, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+ static void f16_f32acc_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state, xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast, 5, 16, 1, 1,
+     xnn_init_f16_minmax_avx_params, benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_CONV(f16_f32acc_igemm_1x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_4x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_5x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_6x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_7x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_1x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_3x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_4x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_f32acc_igemm_5x16__avx2_broadcast)
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-f32acc-rsum.cc ADDED
@@ -0,0 +1,140 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/reduce.h>
+
+
+ static void f16_f32acc_rsum(
+   benchmark::State& state,
+   xnn_f16_f32acc_rsum_ukernel_fn rsum,
+   xnn_init_f16_f32acc_scale_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+
+   xnn_f16_f32acc_scale_params params;
+   init_params(&params, /*scale=*/0.1f);
+
+   uint16_t output = UINT16_C(0x7E00) /* NaN */;
+   for (auto _ : state) {
+     rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
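+ // The _accN suffix marks variants that spread the running sum over N vector
+ // accumulators to hide floating-point add latency; the partial sums are combined
+ // once at the end of the reduction.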
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x4,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x4,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x8,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x8,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x16_acc2,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x16_acc2,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x24_acc3,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x24_acc3,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc2,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc2,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, neonfp16_x32_acc4,
+     xnn_f16_f32acc_rsum_ukernel__neonfp16_x32_acc4,
+     xnn_init_f16_f32acc_scale_scalar_params,
+     benchmark::utils::CheckNEONFP16)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x8,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x8,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x16_acc2,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x16_acc2,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x24_acc3,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x24_acc3,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc2,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc2,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_x32_acc4,
+     xnn_f16_f32acc_rsum_ukernel__f16c_x32_acc4,
+     xnn_init_f16_f32acc_scale_avx_params,
+     benchmark::utils::CheckF16C)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-gavgpool-cw.cc ADDED
@@ -0,0 +1,77 @@
1
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <functional>
9
+ #include <numeric>
10
+ #include <vector>
11
+
12
+ #include "bench/utils.h"
13
+ #include <benchmark/benchmark.h>
14
+
15
+ #include <xnnpack.h>
16
+ #include <xnnpack/aligned-allocator.h>
17
+ #include <xnnpack/common.h>
18
+ #include <xnnpack/gavgpool.h>
19
+ #include <xnnpack/microfnptr.h>
20
+ #include <xnnpack/microparams-init.h>
21
+
22
+
23
+ void f16_gavgpool_cw(
24
+ benchmark::State& state,
25
+ xnn_f16_gavgpool_cw_ukernel_fn gavgpool_cw,
26
+ xnn_init_f16_gavgpool_neonfp16arith_params_fn init_params,
27
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
28
+ {
29
+ if (isa_check && !isa_check(state)) {
30
+ return;
31
+ }
32
+ const size_t channels = state.range(0);
33
+ const size_t elements = state.range(1);
34
+
35
+ std::vector<int16_t, AlignedAllocator<int16_t, 64>> input(elements * channels + XNN_EXTRA_BYTES / sizeof(int16_t));
36
+ std::vector<int16_t> output(channels);
37
+ std::iota(input.begin(), input.end(), 0);
38
+
39
+ // Prepare parameters.
40
+ union xnn_f16_gavgpool_params params;
41
+ init_params(&params,
42
+ UINT16_C(0x3C00) /* scale */, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */, elements);
43
+
44
+ for (auto _ : state) {
45
+ gavgpool_cw(elements, channels, input.data(), output.data(), &params);
46
+ }
47
+
48
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49
+ if (cpu_frequency != 0) {
50
+ state.counters["cpufreq"] = cpu_frequency;
51
+ }
52
+ }
53
+
54
+ static void BenchmarkBatch(benchmark::internal::Benchmark* b)
55
+ {
56
+ b->ArgNames({"channels", "elements"});
57
+ b->Args({1, 1024});
58
+ b->Args({2, 1024});
59
+ b->Args({4, 1024});
60
+ b->Args({6, 1024});
61
+ b->Args({8, 1024});
62
+ b->Args({16, 1024});
63
+ b->Args({1024, 1024});
64
+ }
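+ // The grid above fixes elements at 1024 and sweeps the channel count: the
+ // small channel counts exercise the kernel's channel-remainder handling,
+ // while 1024x1024 is presumably large enough to be bandwidth-bound rather
+ // than compute-bound.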
65
+
66
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
67
+ BENCHMARK_CAPTURE(f16_gavgpool_cw, f16_neon_x8,
68
+ xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8,
69
+ xnn_init_f16_gavgpool_neonfp16arith_params,
70
+ benchmark::utils::CheckNEONFP16ARITH)
71
+ ->Apply(BenchmarkBatch)
72
+ ->UseRealTime();
73
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
74
+
75
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
76
+ BENCHMARK_MAIN();
77
+ #endif
bench/f16-gemm-e2e.cc ADDED
@@ -0,0 +1,452 @@
1
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <cstring>
9
+ #include <functional>
10
+ #include <memory>
11
+ #include <random>
12
+ #include <vector>
13
+
14
+ #include "bench/end2end.h"
15
+ #include "bench/utils.h"
16
+ #include <benchmark/benchmark.h>
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/config.h>
20
+ #include <xnnpack/gemm.h>
21
+ #include <xnnpack/igemm.h>
22
+ #include <xnnpack/microfnptr.h>
23
+ #include <xnnpack/microparams-init.h>
24
+ #include <xnnpack/models.h>
25
+ #include <xnnpack/pack.h>
26
+
27
+
28
+ static void GEMMEnd2EndBenchmark(
29
+ benchmark::State& state,
30
+ models::ExecutionPlanFactory model_factory,
31
+ xnn_f16_gemm_minmax_ukernel_fn gemm_minmax,
32
+ xnn_f16_igemm_minmax_ukernel_fn igemm_minmax,
33
+ xnn_f16_gemm_minmax_ukernel_fn gemm1_minmax,
34
+ xnn_f16_igemm_minmax_ukernel_fn igemm1_minmax,
35
+ xnn_init_f16_minmax_params_fn init_params,
36
+ uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
37
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
38
+ {
39
+ if (isa_check != nullptr && !isa_check(state)) {
40
+ return;
41
+ }
42
+ if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
43
+ state.SkipWithError("failed to initialize XNNPACK");
44
+ return;
45
+ }
46
+
47
+ struct xnn_gemm_config* gemm_config = xnn_init_f16_gemm_config();
48
+ if (gemm_config == nullptr) {
49
+ state.SkipWithError("hardware does not support F16 gemm");
50
+ return;
51
+ }
52
+
53
+ // Override microkernels chosen in xnn_initialize
54
+ std::memset(gemm_config, 0, sizeof(struct xnn_gemm_config));
55
+ gemm_config->minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm_minmax));
56
+ gemm_config->minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm_minmax));
57
+ gemm_config->minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_fn(gemm1_minmax));
58
+ gemm_config->minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_fn(igemm1_minmax));
59
+ gemm_config->init.f16 = init_params;
60
+ gemm_config->mr = mr;
61
+ gemm_config->nr = nr;
62
+ gemm_config->log2_kr = log2_kr;
63
+ gemm_config->log2_sr = log2_sr;
64
+ gemm_config->pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f16_gemm_goi_w;
65
+
66
+ auto execution_plan = model_factory(nullptr);
67
+ if (execution_plan.empty()) {
68
+ state.SkipWithError("failed to create a model");
69
+ return;
70
+ }
71
+
72
+ for (auto _ : state) {
73
+ for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
74
+ xnn_status status = xnn_run_operator(op.get(), nullptr);
75
+ if (status != xnn_status_success) {
76
+ state.SkipWithError("failed to run a model");
77
+ return;
78
+ }
79
+ }
80
+ }
81
+
82
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
83
+ if (cpu_frequency != 0) {
84
+ state.counters["cpufreq"] = cpu_frequency;
85
+ }
86
+ }
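+ // How the override above works: xnn_init_f16_gemm_config() returns a pointer
+ // to the process-wide F16 GEMM config, so zeroing it and repopulating slots
+ // [0] and [mr-1] forces the end-to-end model run onto exactly one kernel
+ // pair. Slot [mr-1] holds the full mr-row kernel; slot [0] holds the 1-row
+ // kernel used for the remainder when fewer than mr rows are left.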
87
+
88
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
89
+ static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
90
+ GEMMEnd2EndBenchmark(state, model,
91
+ xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
92
+ xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
93
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
94
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
95
+ xnn_init_f16_minmax_fp16arith_params,
96
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
97
+ benchmark::utils::CheckNEONFP16ARITH);
98
+ }
99
+
100
+ static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
101
+ GEMMEnd2EndBenchmark(state, model,
102
+ xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
103
+ xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
104
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
105
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
106
+ xnn_init_f16_minmax_fp16arith_params,
107
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
108
+ benchmark::utils::CheckNEONFP16ARITH);
109
+ }
110
+
111
+ static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
112
+ GEMMEnd2EndBenchmark(state, model,
113
+ xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
114
+ xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
115
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
116
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
117
+ xnn_init_f16_minmax_fp16arith_params,
118
+ 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
119
+ benchmark::utils::CheckNEONFP16ARITH);
120
+ }
121
+
122
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
123
+ GEMMEnd2EndBenchmark(state, model,
124
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
125
+ xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
126
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
127
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
128
+ xnn_init_f16_minmax_fp16arith_params,
129
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
130
+ benchmark::utils::CheckNEONFP16ARITH);
131
+ }
132
+
133
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
134
+ GEMMEnd2EndBenchmark(state, model,
135
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
136
+ xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
137
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
138
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
139
+ xnn_init_f16_minmax_fp16arith_params,
140
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
141
+ benchmark::utils::CheckNEONFP16ARITH);
142
+ }
143
+
144
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
145
+ GEMMEnd2EndBenchmark(state, model,
146
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
147
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
148
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
149
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
150
+ xnn_init_f16_minmax_fp16arith_params,
151
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
152
+ benchmark::utils::CheckNEONFP16ARITH);
153
+ }
154
+
155
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
156
+ GEMMEnd2EndBenchmark(state, model,
157
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
158
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
159
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
160
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
161
+ xnn_init_f16_minmax_fp16arith_params,
162
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
163
+ benchmark::utils::CheckNEONFP16ARITH);
164
+ }
165
+
166
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
167
+ GEMMEnd2EndBenchmark(state, model,
168
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
169
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
170
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
171
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
172
+ xnn_init_f16_minmax_fp16arith_params,
173
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
174
+ benchmark::utils::CheckNEONFP16ARITH);
175
+ }
176
+
177
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, models::ExecutionPlanFactory model) {
178
+ GEMMEnd2EndBenchmark(state, model,
179
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
180
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
181
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
182
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
183
+ xnn_init_f16_minmax_fp16arith_params,
184
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
185
+ benchmark::utils::CheckNEONFP16ARITH);
186
+ }
187
+
188
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
189
+ GEMMEnd2EndBenchmark(state, model,
190
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
191
+ xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
192
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
193
+ xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
194
+ xnn_init_f16_minmax_fp16arith_params,
195
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
196
+ benchmark::utils::CheckNEONFP16ARITH);
197
+ }
198
+
199
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64);
200
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64);
201
+ BENCHMARK_FP16_END2END(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64);
202
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32);
203
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64);
204
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32);
205
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64);
206
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55);
207
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0);
208
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75);
209
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
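+ // BENCHMARK_FP16_END2END comes from bench/end2end.h; presumably it registers
+ // the given function once per FP16-capable end-to-end model (e.g. the FP16
+ // MobileNet variants), passing the corresponding ExecutionPlanFactory as the
+ // `model` argument.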
210
+
211
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
212
+ static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
213
+ GEMMEnd2EndBenchmark(state, model,
214
+ xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
215
+ xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
216
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
217
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
218
+ xnn_init_f16_minmax_fp16arith_params,
219
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
220
+ benchmark::utils::CheckNEONFP16ARITH);
221
+ }
222
+
223
+ static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
224
+ GEMMEnd2EndBenchmark(state, model,
225
+ xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
226
+ xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
227
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
228
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
229
+ xnn_init_f16_minmax_fp16arith_params,
230
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
231
+ benchmark::utils::CheckNEONFP16ARITH);
232
+ }
233
+
234
+ static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
235
+ GEMMEnd2EndBenchmark(state, model,
236
+ xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
237
+ xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
238
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
239
+ xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
240
+ xnn_init_f16_minmax_fp16arith_params,
241
+ 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
242
+ benchmark::utils::CheckNEONFP16ARITH);
243
+ }
244
+
245
+ static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
246
+ GEMMEnd2EndBenchmark(state, model,
247
+ xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
248
+ xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
249
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
250
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
251
+ xnn_init_f16_minmax_fp16arith_params,
252
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
253
+ benchmark::utils::CheckNEONFP16ARITH);
254
+ }
255
+
256
+ static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
257
+ GEMMEnd2EndBenchmark(state, model,
258
+ xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
259
+ xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
260
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
261
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
262
+ xnn_init_f16_minmax_fp16arith_params,
263
+ 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
264
+ benchmark::utils::CheckNEONFP16ARITH);
265
+ }
266
+
267
+ static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
268
+ GEMMEnd2EndBenchmark(state, model,
269
+ xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
270
+ xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
271
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
272
+ xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
273
+ xnn_init_f16_minmax_fp16arith_params,
274
+ 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
275
+ benchmark::utils::CheckNEONFP16ARITH);
276
+ }
277
+
278
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__neonfp16arith_ld64);
279
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__neonfp16arith_ld64);
280
+ BENCHMARK_FP16_END2END(f16_gemm_8x8__neonfp16arith_ld64);
281
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__neonfp16arith_ld64);
282
+ BENCHMARK_FP16_END2END(f16_gemm_6x16__neonfp16arith_ld64);
283
+ BENCHMARK_FP16_END2END(f16_gemm_8x16__neonfp16arith_ld64);
284
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
285
+
286
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
287
+ static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
288
+ GEMMEnd2EndBenchmark(state, model,
289
+ xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
290
+ xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
291
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
292
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
293
+ xnn_init_f16_minmax_avx_params,
294
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
295
+ benchmark::utils::CheckAVX2);
296
+ }
297
+ static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
298
+ GEMMEnd2EndBenchmark(state, model,
299
+ xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
300
+ xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
301
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
302
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
303
+ xnn_init_f16_minmax_avx_params,
304
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
305
+ benchmark::utils::CheckAVX2);
306
+ }
307
+ static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
308
+ GEMMEnd2EndBenchmark(state, model,
309
+ xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
310
+ xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
311
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
312
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
313
+ xnn_init_f16_minmax_avx_params,
314
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
315
+ benchmark::utils::CheckAVX2);
316
+ }
317
+ static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
318
+ GEMMEnd2EndBenchmark(state, model,
319
+ xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
320
+ xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
321
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
322
+ xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
323
+ xnn_init_f16_minmax_avx_params,
324
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
325
+ benchmark::utils::CheckAVX2);
326
+ }
327
+
328
+ static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
329
+ GEMMEnd2EndBenchmark(state, model,
330
+ xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
331
+ xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
332
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
333
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
334
+ xnn_init_f16_minmax_avx_params,
335
+ 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
336
+ benchmark::utils::CheckAVX2);
337
+ }
338
+ static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
339
+ GEMMEnd2EndBenchmark(state, model,
340
+ xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
341
+ xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
342
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
343
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
344
+ xnn_init_f16_minmax_avx_params,
345
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
346
+ benchmark::utils::CheckAVX2);
347
+ }
348
+ static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
349
+ GEMMEnd2EndBenchmark(state, model,
350
+ xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
351
+ xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
352
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
353
+ xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
354
+ xnn_init_f16_minmax_avx_params,
355
+ 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
356
+ benchmark::utils::CheckAVX2);
357
+ }
358
+
359
+ static void f16_f32acc_gemm_4x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
360
+ GEMMEnd2EndBenchmark(state, model,
361
+ xnn_f16_f32acc_gemm_minmax_ukernel_4x8__avx2_broadcast,
362
+ xnn_f16_f32acc_igemm_minmax_ukernel_4x8__avx2_broadcast,
363
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
364
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
365
+ xnn_init_f16_minmax_avx_params,
366
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
367
+ benchmark::utils::CheckAVX2);
368
+ }
369
+ static void f16_f32acc_gemm_5x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
370
+ GEMMEnd2EndBenchmark(state, model,
371
+ xnn_f16_f32acc_gemm_minmax_ukernel_5x8__avx2_broadcast,
372
+ xnn_f16_f32acc_igemm_minmax_ukernel_5x8__avx2_broadcast,
373
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
374
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
375
+ xnn_init_f16_minmax_avx_params,
376
+ 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
377
+ benchmark::utils::CheckAVX2);
378
+ }
379
+ static void f16_f32acc_gemm_6x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
380
+ GEMMEnd2EndBenchmark(state, model,
381
+ xnn_f16_f32acc_gemm_minmax_ukernel_6x8__avx2_broadcast,
382
+ xnn_f16_f32acc_igemm_minmax_ukernel_6x8__avx2_broadcast,
383
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
384
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
385
+ xnn_init_f16_minmax_avx_params,
386
+ 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
387
+ benchmark::utils::CheckAVX2);
388
+ }
389
+ static void f16_f32acc_gemm_7x8__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
390
+ GEMMEnd2EndBenchmark(state, model,
391
+ xnn_f16_f32acc_gemm_minmax_ukernel_7x8__avx2_broadcast,
392
+ xnn_f16_f32acc_igemm_minmax_ukernel_7x8__avx2_broadcast,
393
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x8__avx2_broadcast,
394
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x8__avx2_broadcast,
395
+ xnn_init_f16_minmax_avx_params,
396
+ 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
397
+ benchmark::utils::CheckAVX2);
398
+ }
399
+
400
+ static void f16_f32acc_gemm_3x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
401
+ GEMMEnd2EndBenchmark(state, model,
402
+ xnn_f16_f32acc_gemm_minmax_ukernel_3x16__avx2_broadcast,
403
+ xnn_f16_f32acc_igemm_minmax_ukernel_3x16__avx2_broadcast,
404
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
405
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
406
+ xnn_init_f16_minmax_avx_params,
407
+ 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
408
+ benchmark::utils::CheckAVX2);
409
+ }
410
+ static void f16_f32acc_gemm_4x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
411
+ GEMMEnd2EndBenchmark(state, model,
412
+ xnn_f16_f32acc_gemm_minmax_ukernel_4x16__avx2_broadcast,
413
+ xnn_f16_f32acc_igemm_minmax_ukernel_4x16__avx2_broadcast,
414
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
415
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
416
+ xnn_init_f16_minmax_avx_params,
417
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
418
+ benchmark::utils::CheckAVX2);
419
+ }
420
+ static void f16_f32acc_gemm_5x16__avx2_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
421
+ GEMMEnd2EndBenchmark(state, model,
422
+ xnn_f16_f32acc_gemm_minmax_ukernel_5x16__avx2_broadcast,
423
+ xnn_f16_f32acc_igemm_minmax_ukernel_5x16__avx2_broadcast,
424
+ xnn_f16_f32acc_gemm_minmax_ukernel_1x16__avx2_broadcast,
425
+ xnn_f16_f32acc_igemm_minmax_ukernel_1x16__avx2_broadcast,
426
+ xnn_init_f16_minmax_avx_params,
427
+ 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
428
+ benchmark::utils::CheckAVX2);
429
+ }
430
+
431
+ BENCHMARK_FP16_END2END(f16_gemm_4x8__avx2_broadcast);
432
+ BENCHMARK_FP16_END2END(f16_gemm_5x8__avx2_broadcast);
433
+ BENCHMARK_FP16_END2END(f16_gemm_6x8__avx2_broadcast);
434
+ BENCHMARK_FP16_END2END(f16_gemm_7x8__avx2_broadcast);
435
+
436
+ BENCHMARK_FP16_END2END(f16_gemm_3x16__avx2_broadcast);
437
+ BENCHMARK_FP16_END2END(f16_gemm_4x16__avx2_broadcast);
438
+ BENCHMARK_FP16_END2END(f16_gemm_5x16__avx2_broadcast);
439
+
440
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x8__avx2_broadcast);
441
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x8__avx2_broadcast);
442
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_6x8__avx2_broadcast);
443
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_7x8__avx2_broadcast);
444
+
445
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_3x16__avx2_broadcast);
446
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_4x16__avx2_broadcast);
447
+ BENCHMARK_FP16_END2END(f16_f32acc_gemm_5x16__avx2_broadcast);
448
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
449
+
450
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
451
+ BENCHMARK_MAIN();
452
+ #endif
bench/f16-gemm.cc ADDED
@@ -0,0 +1,513 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // Copyright 2019 Google LLC
5
+ //
6
+ // This source code is licensed under the BSD-style license found in the
7
+ // LICENSE file in the root directory of this source tree.
8
+
9
+ #include <algorithm>
10
+ #include <cfloat>
11
+ #include <cmath>
12
+ #include <functional>
13
+ #include <random>
14
+ #include <vector>
15
+
16
+ #include <benchmark/benchmark.h>
17
+ #include <fp16/fp16.h>
18
+ #include "bench/gemm.h"
19
+ #include "bench/utils.h"
20
+
21
+ #include <xnnpack.h>
22
+ #include <xnnpack/aligned-allocator.h>
23
+ #include <xnnpack/common.h>
24
+ #include <xnnpack/gemm.h>
25
+ #include <xnnpack/math.h>
26
+ #include <xnnpack/pack.h>
27
+ #include <xnnpack/microfnptr.h>
28
+ #include <xnnpack/microparams-init.h>
29
+
30
+
31
+ static void f16_gemm(benchmark::State& state,
32
+ xnn_f16_gemm_minmax_ukernel_fn gemm,
33
+ xnn_init_f16_minmax_params_fn init_params,
34
+ size_t mr, size_t nr, size_t kr, size_t sr,
35
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
36
+ {
37
+ if (isa_check != nullptr && !isa_check(state)) {
38
+ return;
39
+ }
40
+
41
+ const size_t mc = state.range(0);
42
+ const size_t nc = state.range(1);
43
+ const size_t kc = state.range(2);
44
+
45
+ const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
46
+ const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
47
+
48
+ std::random_device random_device;
49
+ auto rng = std::mt19937(random_device());
50
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
51
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
52
+
53
+ std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
54
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
55
+ std::vector<uint16_t> k(nc * kc);
56
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
57
+ std::vector<uint16_t> b(nc);
58
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
59
+
60
+ const size_t w_elements = nc_stride * kc_stride + nc_stride;
61
+ const size_t c_elements = mc * nc;
62
+ const size_t num_buffers = 1 +
63
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
64
+ sizeof(uint16_t) * (w_elements + c_elements));
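+ // num_buffers is sized so that the combined footprint of all W and C copies
+ // exceeds the largest cache: rotating through the copies below guarantees
+ // that each iteration sees cold packed weights and a cold output buffer.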
65
+
66
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
67
+ std::fill(w.begin(), w.end(), 0);
68
+ xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
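+ // "goi" refers to the source weight layout [groups, output channels, input
+ // channels]. The packed buffer stores one bias value plus kc_stride weights
+ // per output channel, in nr-wide panels, which is why the indexing in the
+ // loop below strides by (kc_stride + 1) elements per output channel.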
69
+ std::vector<uint16_t> c(c_elements * num_buffers);
70
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
71
+
72
+ // Prepare minmax parameters.
73
+ xnn_f16_minmax_params params;
74
+ init_params(&params,
75
+ UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
76
+
77
+ size_t buffer_index = 0;
78
+ for (auto _ : state) {
79
+ // Use circular buffers (exceeding cache size) and prefetch to control cache state:
80
+ // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
81
+ // - W is not in cache (for any cache level)
82
+ // - C is not in cache (for any cache level)
83
+ state.PauseTiming();
84
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
85
+ buffer_index = (buffer_index + 1) % num_buffers;
86
+ state.ResumeTiming();
87
+
88
+ for (uint32_t m = 0; m < mc; m += mr) {
89
+ const uint32_t mb = min(mc - m, mr);
90
+ for (uint32_t n = 0; n < nc; n += nr) {
91
+ const uint32_t nb = min(nc - n, nr);
92
+ gemm(
93
+ mb, nb, kc * sizeof(uint16_t),
94
+ a.data() + m * kc, kc * sizeof(uint16_t),
95
+ w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
96
+ c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
97
+ &params);
98
+ }
99
+ }
100
+ }
101
+
102
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
103
+ if (cpu_frequency != 0) {
104
+ state.counters["cpufreq"] = cpu_frequency;
105
+ }
106
+
107
+ state.counters["FLOPS"] = benchmark::Counter(
108
+ uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
109
+ }
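+ // The FLOPS counter charges 2 ops (one multiply, one add) per MAC, so a
+ // single GEMM of size mc x nc x kc costs 2*mc*nc*kc FLOPs; e.g. for
+ // mc = nc = kc = 256 that is 2 * 256^3 = 33,554,432 FLOPs per iteration.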
110
+
111
+
112
+ #if XNN_PLATFORM_JIT
113
+ static void f16_gemm(benchmark::State& state,
114
+ xnn_jit_gemm_code_generator_fn generator,
115
+ xnn_init_f16_minmax_params_fn init_params,
116
+ size_t mr, size_t nr, size_t kr, size_t sr,
117
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
118
+ {
119
+ if (isa_check != nullptr && !isa_check(state)) {
120
+ return;
121
+ }
122
+
123
+ const size_t mc = state.range(0);
124
+ const size_t nc = state.range(1);
125
+ const size_t kc = state.range(2);
126
+
127
+ const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
128
+ const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
129
+
130
+ std::random_device random_device;
131
+ auto rng = std::mt19937(random_device());
132
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
133
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
134
+
135
+ std::vector<uint16_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint16_t));
136
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
137
+ std::vector<uint16_t> k(nc * kc);
138
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
139
+ std::vector<uint16_t> b(nc);
140
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
141
+
142
+ const size_t w_elements = nc_stride * kc_stride + nc_stride;
143
+ const size_t c_elements = mc * nc;
144
+ const size_t num_buffers = 1 +
145
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
146
+ sizeof(uint16_t) * (w_elements + c_elements));
147
+
148
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
149
+ std::fill(w.begin(), w.end(), 0);
150
+ xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
151
+ std::vector<uint16_t> c(c_elements * num_buffers);
152
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
153
+
154
+ // Prepare minmax parameters.
155
+ xnn_f16_minmax_params params;
156
+ init_params(&params,
157
+ UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
158
+
159
+ jit_gemm_params jit_params = {};
160
+ jit_params.f16_minmax.min = UINT16_C(0xFC00); /* -inf */
161
+ jit_params.f16_minmax.max = UINT16_C(0x7C00); /* inf */
162
+
163
+ xnn_code_buffer code_buffer;
164
+ xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
165
+ generator(&code_buffer, mr, nc % nr, kc * sizeof(uint16_t), &jit_params);
166
+ xnn_finalize_code_memory(&code_buffer);
167
+ xnn_f16_gemm_minmax_ukernel_fn gemm = reinterpret_cast<xnn_f16_gemm_minmax_ukernel_fn>(code_buffer.start);
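+ // At this point the generator has emitted machine code for an mr x nr
+ // microkernel into code_buffer (specialized for the nc % nr remainder
+ // columns passed above), xnn_finalize_code_memory() has made the pages
+ // executable, and the cast turns the buffer into a directly callable
+ // microkernel with the same signature as the AOT-compiled ones. Error
+ // returns from the allocate/generate/finalize calls are ignored here for
+ // brevity.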
168
+
169
+ size_t buffer_index = 0;
170
+ for (auto _ : state) {
171
+ // Use circular buffers (exceeding cache size) and prefetch to control cache state:
172
+ // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
173
+ // - W is not in cache (for any cache level)
174
+ // - C is not in cache (for any cache level)
175
+ state.PauseTiming();
176
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
177
+ buffer_index = (buffer_index + 1) % num_buffers;
178
+ state.ResumeTiming();
179
+
180
+ for (uint32_t m = 0; m < mc; m += mr) {
181
+ const uint32_t mb = min(mc - m, mr);
182
+ for (uint32_t n = 0; n < nc; n += nr) {
183
+ const uint32_t nb = min(nc - n, nr);
184
+ gemm(
185
+ mb, nb, kc * sizeof(uint16_t),
186
+ a.data() + m * kc, kc * sizeof(uint16_t),
187
+ w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
188
+ c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
189
+ &params);
190
+ }
191
+ }
192
+ }
193
+
194
+ xnn_release_code_memory(&code_buffer);
195
+
196
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
197
+ if (cpu_frequency != 0) {
198
+ state.counters["cpufreq"] = cpu_frequency;
199
+ }
200
+
201
+ state.counters["FLOPS"] = benchmark::Counter(
202
+ uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
203
+ }
204
+ #endif // XNN_PLATFORM_JIT
205
+
206
+
207
+ #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
208
+ static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
209
+ f16_gemm(state,
210
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
211
+ xnn_init_f16_minmax_fp16arith_params,
212
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
213
+ benchmark::utils::CheckNEONFP16ARITH);
214
+ }
215
+ static void f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
216
+ f16_gemm(state,
217
+ xnn_f16_gemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
218
+ xnn_init_f16_minmax_fp16arith_params,
219
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
220
+ benchmark::utils::CheckNEONFP16ARITH);
221
+ }
222
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
223
+ f16_gemm(state,
224
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
225
+ xnn_init_f16_minmax_fp16arith_params,
226
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
227
+ benchmark::utils::CheckNEONFP16ARITH);
228
+ }
229
+ static void f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
230
+ f16_gemm(state,
231
+ xnn_f16_gemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
232
+ xnn_init_f16_minmax_fp16arith_params,
233
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
234
+ benchmark::utils::CheckNEONFP16ARITH);
235
+ }
236
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
237
+ f16_gemm(state,
238
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
239
+ xnn_init_f16_minmax_fp16arith_params,
240
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
241
+ benchmark::utils::CheckNEONFP16ARITH);
242
+ }
243
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
244
+ f16_gemm(state,
245
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
246
+ xnn_init_f16_minmax_fp16arith_params,
247
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
248
+ benchmark::utils::CheckNEONFP16ARITH);
249
+ }
250
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
251
+ f16_gemm(state,
252
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
253
+ xnn_init_f16_minmax_fp16arith_params,
254
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
255
+ benchmark::utils::CheckNEONFP16ARITH);
256
+ }
257
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
258
+ f16_gemm(state,
259
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld32,
260
+ xnn_init_f16_minmax_fp16arith_params,
261
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
262
+ benchmark::utils::CheckNEONFP16ARITH);
263
+ }
264
+ static void f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
265
+ f16_gemm(state,
266
+ xnn_f16_gemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
267
+ xnn_init_f16_minmax_fp16arith_params,
268
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
269
+ benchmark::utils::CheckNEONFP16ARITH);
270
+ }
271
+ static void f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
272
+ f16_gemm(state,
273
+ xnn_f16_gemm_minmax_ukernel_1x8__asm_aarch64_neonfp16arith_ld64,
274
+ xnn_init_f16_minmax_fp16arith_params,
275
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
276
+ benchmark::utils::CheckNEONFP16ARITH);
277
+ }
278
+ static void f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
279
+ f16_gemm(state,
280
+ xnn_f16_gemm_minmax_ukernel_4x8__asm_aarch64_neonfp16arith_ld64,
281
+ xnn_init_f16_minmax_fp16arith_params,
282
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
283
+ benchmark::utils::CheckNEONFP16ARITH);
284
+ }
285
+ static void f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
286
+ f16_gemm(state,
287
+ xnn_f16_gemm_minmax_ukernel_6x8__asm_aarch64_neonfp16arith_ld64,
288
+ xnn_init_f16_minmax_fp16arith_params,
289
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
290
+ benchmark::utils::CheckNEONFP16ARITH);
291
+ }
292
+ static void f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
293
+ f16_gemm(state,
294
+ xnn_f16_gemm_minmax_ukernel_8x8__asm_aarch64_neonfp16arith_ld64,
295
+ xnn_init_f16_minmax_fp16arith_params,
296
+ /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
297
+ benchmark::utils::CheckNEONFP16ARITH);
298
+ }
299
+
300
+ BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld32)
301
+ BENCHMARK_GEMM(f16_gemm_1x16__asm_aarch64_neonfp16arith_ld64)
302
+ BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld32)
303
+ BENCHMARK_GEMM(f16_gemm_4x16__asm_aarch64_neonfp16arith_ld64)
304
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
305
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
306
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
307
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld32)
308
+ BENCHMARK_GEMM(f16_gemm_6x16__asm_aarch64_neonfp16arith_ld64)
309
+ BENCHMARK_GEMM(f16_gemm_1x8__asm_aarch64_neonfp16arith_ld64)
310
+ BENCHMARK_GEMM(f16_gemm_4x8__asm_aarch64_neonfp16arith_ld64)
311
+ BENCHMARK_GEMM(f16_gemm_6x8__asm_aarch64_neonfp16arith_ld64)
312
+ BENCHMARK_GEMM(f16_gemm_8x8__asm_aarch64_neonfp16arith_ld64)
313
+ #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
314
+
315
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
316
+ static void f16_gemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
317
+ f16_gemm(state,
318
+ xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64,
319
+ xnn_init_f16_minmax_fp16arith_params,
320
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
321
+ benchmark::utils::CheckNEONFP16ARITH);
322
+ }
323
+ static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
324
+ f16_gemm(state,
325
+ xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64,
326
+ xnn_init_f16_minmax_fp16arith_params,
327
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
328
+ benchmark::utils::CheckNEONFP16ARITH);
329
+ }
330
+ static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
331
+ f16_gemm(state,
332
+ xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64,
333
+ xnn_init_f16_minmax_fp16arith_params,
334
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
335
+ benchmark::utils::CheckNEONFP16ARITH);
336
+ }
337
+ static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
338
+ f16_gemm(state,
339
+ xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64,
340
+ xnn_init_f16_minmax_fp16arith_params,
341
+ /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
342
+ benchmark::utils::CheckNEONFP16ARITH);
343
+ }
344
+ static void f16_gemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
345
+ f16_gemm(state,
346
+ xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64,
347
+ xnn_init_f16_minmax_fp16arith_params,
348
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
349
+ benchmark::utils::CheckNEONFP16ARITH);
350
+ }
351
+ static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
352
+ f16_gemm(state,
353
+ xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64,
354
+ xnn_init_f16_minmax_fp16arith_params,
355
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
356
+ benchmark::utils::CheckNEONFP16ARITH);
357
+ }
358
+ static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
359
+ f16_gemm(state,
360
+ xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64,
361
+ xnn_init_f16_minmax_fp16arith_params,
362
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
363
+ benchmark::utils::CheckNEONFP16ARITH);
364
+ }
365
+ static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
366
+ f16_gemm(state,
367
+ xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64,
368
+ xnn_init_f16_minmax_fp16arith_params,
369
+ /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
370
+ benchmark::utils::CheckNEONFP16ARITH);
371
+ }
372
+
373
+ BENCHMARK_GEMM(f16_gemm_1x8__neonfp16arith_ld64)
374
+ BENCHMARK_GEMM(f16_gemm_4x8__neonfp16arith_ld64)
375
+ BENCHMARK_GEMM(f16_gemm_6x8__neonfp16arith_ld64)
376
+ BENCHMARK_GEMM(f16_gemm_8x8__neonfp16arith_ld64)
377
+ BENCHMARK_GEMM(f16_gemm_1x16__neonfp16arith_ld64)
378
+ BENCHMARK_GEMM(f16_gemm_4x16__neonfp16arith_ld64)
379
+ BENCHMARK_GEMM(f16_gemm_6x16__neonfp16arith_ld64)
380
+ BENCHMARK_GEMM(f16_gemm_8x16__neonfp16arith_ld64)
381
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
382
+
383
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
384
+ static void f16_gemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
385
+ f16_gemm(state,
386
+ xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast,
387
+ xnn_init_f16_minmax_avx_params,
388
+ /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
389
+ benchmark::utils::CheckAVX2);
390
+ }
391
+ static void f16_gemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
392
+ f16_gemm(state,
393
+ xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast,
394
+ xnn_init_f16_minmax_avx_params,
395
+ /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
396
+ benchmark::utils::CheckAVX2);
397
+ }
398
+ static void f16_gemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
399
+ f16_gemm(state,
400
+ xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast,
401
+ xnn_init_f16_minmax_avx_params,
402
+ /*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
403
+ benchmark::utils::CheckAVX2);
404
+ }
405
+ static void f16_gemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
406
+ f16_gemm(state,
407
+ xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast,
408
+ xnn_init_f16_minmax_avx_params,
409
+ /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
410
+ benchmark::utils::CheckAVX2);
411
+ }
412
+ static void f16_gemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
413
+ f16_gemm(state,
414
+ xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast,
415
+ xnn_init_f16_minmax_avx_params,
416
+ /*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
417
+ benchmark::utils::CheckAVX2);
418
+ }
419
+ static void f16_gemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
420
+ f16_gemm(state,
421
+ xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast,
422
+ xnn_init_f16_minmax_avx_params,
423
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
424
+ benchmark::utils::CheckAVX2);
425
+ }
426
+ static void f16_gemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
427
+ f16_gemm(state,
428
+ xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast,
429
+ xnn_init_f16_minmax_avx_params,
430
+ /*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
431
+ benchmark::utils::CheckAVX2);
432
+ }
433
+ static void f16_gemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
434
+ f16_gemm(state,
435
+ xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast,
436
+ xnn_init_f16_minmax_avx_params,
437
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
438
+ benchmark::utils::CheckAVX2);
439
+ }
440
+ static void f16_gemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
441
+ f16_gemm(state,
442
+ xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast,
443
+ xnn_init_f16_minmax_avx_params,
444
+ /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
445
+ benchmark::utils::CheckAVX2);
446
+ }
447
+
448
+ BENCHMARK_GEMM(f16_gemm_1x8__avx2_broadcast)
449
+ BENCHMARK_GEMM(f16_gemm_4x8__avx2_broadcast)
450
+ BENCHMARK_GEMM(f16_gemm_5x8__avx2_broadcast)
451
+ BENCHMARK_GEMM(f16_gemm_6x8__avx2_broadcast)
452
+ BENCHMARK_GEMM(f16_gemm_7x8__avx2_broadcast)
453
+ BENCHMARK_GEMM(f16_gemm_1x16__avx2_broadcast)
454
+ BENCHMARK_GEMM(f16_gemm_3x16__avx2_broadcast)
455
+ BENCHMARK_GEMM(f16_gemm_4x16__avx2_broadcast)
456
+ BENCHMARK_GEMM(f16_gemm_5x16__avx2_broadcast)
457
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
458
+
459
+ #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
460
+ static void f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
461
+ f16_gemm(state,
462
+ xnn_generate_f16_gemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
463
+ xnn_init_f16_minmax_fp16arith_params,
464
+ /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
465
+ benchmark::utils::CheckNEONFP16ARITH);
466
+ }
467
+ static void f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
468
+ f16_gemm(state,
469
+ xnn_generate_f16_gemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
470
+ xnn_init_f16_minmax_fp16arith_params,
471
+ /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
472
+ benchmark::utils::CheckNEONFP16ARITH);
473
+ }
474
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
475
+ f16_gemm(state,
476
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
477
+ xnn_init_f16_minmax_fp16arith_params,
478
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
479
+ benchmark::utils::CheckNEONFP16ARITH);
480
+ }
481
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
482
+ f16_gemm(state,
483
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
484
+ xnn_init_f16_minmax_fp16arith_params,
485
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
486
+ benchmark::utils::CheckNEONFP16ARITH);
487
+ }
488
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
489
+ f16_gemm(state,
490
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
491
+ xnn_init_f16_minmax_fp16arith_params,
492
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
493
+ benchmark::utils::CheckNEONFP16ARITH);
494
+ }
495
+ static void f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
496
+ f16_gemm(state,
497
+ xnn_generate_f16_gemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
498
+ xnn_init_f16_minmax_fp16arith_params,
499
+ /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
500
+ benchmark::utils::CheckNEONFP16ARITH);
501
+ }
502
+
503
+ BENCHMARK_GEMM(f16_gemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
504
+ BENCHMARK_GEMM(f16_gemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
505
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
506
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
507
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
508
+ BENCHMARK_GEMM(f16_gemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
509
+ #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
510
+
511
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
512
+ BENCHMARK_MAIN();
513
+ #endif
bench/f16-igemm.cc ADDED
@@ -0,0 +1,588 @@
1
+ // Copyright 2019 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cfloat>
8
+ #include <cmath>
9
+ #include <functional>
10
+ #include <random>
11
+ #include <vector>
12
+
13
+ #include <benchmark/benchmark.h>
14
+ #include <fp16/fp16.h>
15
+ #include "bench/conv.h"
16
+ #include "bench/utils.h"
17
+
18
+ #include <xnnpack.h>
19
+ #include <xnnpack/aligned-allocator.h>
20
+ #include <xnnpack/common.h>
21
+ #include <xnnpack/igemm.h>
22
+ #include <xnnpack/indirection.h>
23
+ #include <xnnpack/microfnptr.h>
24
+ #include <xnnpack/microparams-init.h>
25
+ #include <xnnpack/operator.h>
26
+ #include <xnnpack/pack.h>
27
+
28
+
29
+ static void f16_igemm(benchmark::State& state,
30
+ xnn_f16_igemm_minmax_ukernel_fn igemm,
31
+ xnn_init_f16_minmax_params_fn init_params,
32
+ uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
33
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
34
+ {
35
+ if (isa_check != nullptr && !isa_check(state)) {
36
+ return;
37
+ }
38
+
39
+ const size_t input_height = state.range(0);
40
+ const size_t input_width = state.range(1);
41
+ const size_t kernel_height = state.range(2);
42
+ const size_t kernel_width = state.range(3);
43
+ const size_t kernel_size = kernel_height * kernel_width;
44
+ const size_t padding_height = state.range(4);
45
+ const size_t padding_width = state.range(5);
46
+ const size_t subsampling = state.range(6);
47
+ const size_t dilation = state.range(7);
48
+ const size_t group_input_channels = state.range(8);
49
+ const size_t group_output_channels = state.range(9);
50
+
51
+ std::random_device random_device;
52
+ auto rng = std::mt19937(random_device());
53
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
54
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
55
+
56
+ const size_t output_pixel_stride = group_output_channels;
57
+ const size_t input_pixel_stride = group_input_channels;
58
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
59
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
60
+ const size_t padding_left = padding_width / 2;
61
+ const size_t padding_top = padding_height / 2;
62
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
63
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
64
+ const size_t output_size = output_height * output_width;
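+ // Standard dilated-convolution arithmetic. A worked example, assuming a
+ // 56x56 input, 3x3 kernel, dilation 2, stride 1, and padding_height =
+ // padding_width = 4: the effective kernel is (3-1)*2+1 = 5, so the output
+ // is (56 + 4 - 5)/1 + 1 = 56 pixels per side.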
65
+
66
+ const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
67
+ const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
68
+ const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
69
+
70
+ std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
71
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
72
+ std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
73
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
74
+ std::vector<uint16_t> b(group_output_channels);
75
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
76
+
77
+ std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
78
+
79
+ const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
80
+ const size_t i_elements = mc_stride * kernel_size;
81
+ const size_t c_elements = output_height * output_width * output_pixel_stride;
82
+ const size_t num_buffers = 1 +
83
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
84
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
85
+
86
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
87
+ std::fill(w.begin(), w.end(), 0);
88
+ xnn_pack_f16_conv_goki_w(
89
+ 1 /* groups */, group_output_channels, kernel_size, group_input_channels,
90
+ nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
91
+ for (size_t n = 1; n < num_buffers; n++) {
92
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
93
+ }
94
+
95
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
96
+ xnn_operator convolution_op = { };
97
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
98
+ convolution_op.input = a.data();
99
+ convolution_op.input_pixel_stride = input_pixel_stride;
100
+ convolution_op.zero_buffer = z.data();
101
+ convolution_op.groups = 1;
102
+ convolution_op.group_input_channels = group_input_channels;
103
+ convolution_op.batch_size = 1;
104
+ convolution_op.input_height = input_height;
105
+ convolution_op.input_width = input_width;
106
+ convolution_op.output_height = output_height;
107
+ convolution_op.output_width = output_width;
108
+ convolution_op.kernel_height = kernel_height;
109
+ convolution_op.kernel_width = kernel_width;
110
+ convolution_op.stride_height = subsampling;
111
+ convolution_op.stride_width = subsampling;
112
+ convolution_op.dilation_height = dilation;
113
+ convolution_op.dilation_width = dilation;
114
+ convolution_op.padding_top = padding_top;
115
+ convolution_op.padding_left = padding_left;
116
+ xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
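+ // xnn_indirection_init_conv2d() fills the indirection buffer with one input
+ // pointer per (output pixel, kernel tap) pair; taps that fall into the
+ // padding point at zero_buffer instead. The IGEMM kernel then gathers its
+ // rows through these pointers, avoiding an explicit im2col copy of the input.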
117
+ for (size_t n = 1; n < num_buffers; n++) {
118
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
119
+ }
120
+
121
+ std::vector<uint16_t> c(c_elements * num_buffers);
122
+ std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
123
+
124
+ // Prepare minmax parameters.
125
+ xnn_f16_minmax_params params;
126
+ init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
127
+
128
+ size_t buffer_index = 0;
129
+ for (auto _ : state) {
130
+ state.PauseTiming();
131
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
132
+ buffer_index = (buffer_index + 1) % num_buffers;
133
+ state.ResumeTiming();
134
+
135
+ for (uint32_t m = 0; m < output_size; m += mr) {
136
+ const uint32_t mb = min(output_size - m, mr);
137
+ for (uint32_t n = 0; n < group_output_channels; n += nr) {
138
+ const uint32_t nb = min(group_output_channels - n, nr);
139
+ igemm(
140
+ mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
141
+ reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
142
+ w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
143
+ c.data() + buffer_index * c_elements + m * group_output_channels + n, group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
144
+ 0, z.data(), &params);
145
+ }
146
+ }
147
+ }
148
+
149
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
150
+ if (cpu_frequency != 0) {
151
+ state.counters["cpufreq"] = cpu_frequency;
152
+ }
153
+
154
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_PLATFORM_JIT
+ static void f16_igemm(benchmark::State& state,
+   xnn_jit_igemm_code_generator_fn generator,
+   xnn_init_f16_minmax_params_fn init_params,
+   uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t input_height = state.range(0);
+   const size_t input_width = state.range(1);
+   const size_t kernel_height = state.range(2);
+   const size_t kernel_width = state.range(3);
+   const size_t kernel_size = kernel_height * kernel_width;
+   const size_t padding_height = state.range(4);
+   const size_t padding_width = state.range(5);
+   const size_t subsampling = state.range(6);
+   const size_t dilation = state.range(7);
+   const size_t group_input_channels = state.range(8);
+   const size_t group_output_channels = state.range(9);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t output_pixel_stride = group_output_channels;
+   const size_t input_pixel_stride = group_input_channels;
+   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+   const size_t padding_left = padding_width / 2;
+   const size_t padding_top = padding_height / 2;
+   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+   const size_t output_size = output_height * output_width;
+
+   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
+   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
+   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
+
+   std::vector<uint16_t> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
+   std::generate(a.begin(), a.end(), std::ref(f16rng));
+   std::vector<uint16_t> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
+   std::generate(k.begin(), k.end(), std::ref(f16rng));
+   std::vector<uint16_t> b(group_output_channels);
+   std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+   std::vector<uint16_t> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
+   const size_t i_elements = mc_stride * kernel_size;
+   const size_t c_elements = output_height * output_width * output_pixel_stride;
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+       sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> w(w_elements * num_buffers);
+   std::fill(w.begin(), w.end(), 0);
+   xnn_pack_f16_conv_goki_w(
+     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
+     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+   }
+
+   std::vector<const uint16_t*> i(i_elements * num_buffers);
+   xnn_operator convolution_op = { };
+   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+   convolution_op.input = a.data();
+   convolution_op.input_pixel_stride = input_pixel_stride;
+   convolution_op.zero_buffer = z.data();
+   convolution_op.groups = 1;
+   convolution_op.group_input_channels = group_input_channels;
+   convolution_op.batch_size = 1;
+   convolution_op.input_height = input_height;
+   convolution_op.input_width = input_width;
+   convolution_op.output_height = output_height;
+   convolution_op.output_width = output_width;
+   convolution_op.kernel_height = kernel_height;
+   convolution_op.kernel_width = kernel_width;
+   convolution_op.stride_height = subsampling;
+   convolution_op.stride_width = subsampling;
+   convolution_op.dilation_height = dilation;
+   convolution_op.dilation_width = dilation;
+   convolution_op.padding_top = padding_top;
+   convolution_op.padding_left = padding_left;
+   xnn_indirection_init_conv2d(&convolution_op, mr, XNN_LOG2_SIZEOF_HALF);
+   for (size_t n = 1; n < num_buffers; n++) {
+     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+   }
+
+   std::vector<uint16_t> c(c_elements * num_buffers);
+   std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+   // Prepare minmax parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
+
+   jit_gemm_params jit_params = {};
+   jit_params.f16_minmax.min = UINT16_C(0xFC00);  /* -inf */
+   jit_params.f16_minmax.max = UINT16_C(0x7C00);  /* inf */
+
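+   // JIT path: the generator emits a microkernel specialized for this tile into an
+   // executable code buffer; group_output_channels % nr selects the remainder-N
+   // variant, and the finalized buffer is invoked through a function pointer.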
+   xnn_code_buffer code_buffer;
+   xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
+   generator(&code_buffer,
+     mr,
+     group_output_channels % nr,
+     group_input_channels * sizeof(uint16_t),
+     kernel_size * mr * sizeof(void*),
+     &jit_params);
+   xnn_finalize_code_memory(&code_buffer);
+   auto igemm = reinterpret_cast<xnn_f16_igemm_minmax_ukernel_fn>(code_buffer.start);
+
+   size_t buffer_index = 0;
+   for (auto _ : state) {
+     state.PauseTiming();
+     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+     buffer_index = (buffer_index + 1) % num_buffers;
+     state.ResumeTiming();
+
+     for (uint32_t m = 0; m < output_size; m += mr) {
+       const uint32_t mb = min(output_size - m, mr);
+       for (uint32_t n = 0; n < group_output_channels; n += nr) {
+         const uint32_t nb = min(group_output_channels - n, nr);
+         igemm(
+           mb, nb, group_input_channels * sizeof(uint16_t), kernel_size * mr * sizeof(void*),
+           reinterpret_cast<const void**>(i.data()) + buffer_index * i_elements + m,
+           w.data() + buffer_index * w_elements + n * (kc_stride * kernel_size + 1),
+           c.data() + buffer_index * c_elements + m * group_output_channels + n,
+           group_output_channels * sizeof(uint16_t), nr * sizeof(uint16_t),
+           0, z.data(), &params);
+       }
+     }
+   }
+   xnn_release_code_memory(&code_buffer);
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 *
+       output_height * output_width *
+       group_input_channels * group_output_channels *
+       kernel_height * kernel_width,
+     benchmark::Counter::kIsRate);
+ }
+ #endif  // XNN_PLATFORM_JIT
+
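+ // Each wrapper below binds one microkernel to its tile size (mr x nr) and ISA
+ // check; BENCHMARK_CONV then registers it over the standard convolution shapes.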
+ #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_cortex_a75,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld32,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld32,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__asm_aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_cortex_a75)
+ BENCHMARK_CONV(f16_igemm_6x16__asm_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld32)
+ BENCHMARK_CONV(f16_igemm_4x16__asm_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld32)
+ BENCHMARK_CONV(f16_igemm_1x16__asm_aarch64_neonfp16arith_ld64)
+ #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void f16_igemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/8, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/8, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_1x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_6x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_8x8__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_6x16__neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_8x16__neonfp16arith_ld64)
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f16_igemm_1x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/1, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_4x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/4, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_5x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/5, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_6x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/6, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_7x8__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/7, /*nr=*/8, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_1x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_3x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/3, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_4x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+ static void f16_igemm_5x16__avx2_broadcast(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast,
+     xnn_init_f16_minmax_avx_params,
+     /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckAVX2);
+ }
+
+ BENCHMARK_CONV(f16_igemm_1x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_4x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_5x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_6x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_7x8__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_1x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_3x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_4x16__avx2_broadcast)
+ BENCHMARK_CONV(f16_igemm_5x16__avx2_broadcast)
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/5, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_cortex_a75,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_6x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/6, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_4x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/4, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+   f16_igemm(state,
+     xnn_generate_f16_igemm_ukernel_1x16__aarch64_neonfp16arith_ld64,
+     xnn_init_f16_minmax_fp16arith_params,
+     /*mr=*/1, /*nr=*/16, /*kr=*/1, /*sr=*/1,
+     benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16_5x16__jit_aarch64_neonfp16arith_cortex_a55r0)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_cortex_a75)
+ BENCHMARK_CONV(f16_igemm_6x16_6x16__jit_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_4x16_4x16__jit_aarch64_neonfp16arith_ld64)
+ BENCHMARK_CONV(f16_igemm_1x16_1x16__jit_aarch64_neonfp16arith_ld64)
+ #endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-raddstoreexpminusmax.cc ADDED
@@ -0,0 +1,387 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/raddstoreexpminusmax.h>
+ #include <xnnpack/rmax.h>
+
+
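+ // Benchmarks the softmax inner pass: rmax finds the row maximum, then the
+ // raddstoreexpminusmax kernel computes y[i] = exp(x[i] - max), stores the
+ // results, and accumulates their sum (the max is subtracted for numerical
+ // stability).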
+ static void f16_raddstoreexpminusmax(
+   benchmark::State& state,
+   xnn_f16_rmax_ukernel_fn rmax,
+   xnn_f16_raddstoreexpminusmax_ukernel_fn raddstoreexpminusmax,
+   xnn_init_f16_expminus_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
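+   // Round the output stride up to a whole number of 128-byte cache lines so each
+   // rotated output buffer starts on its own line.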
+   const size_t cache_line_size_max = 128;
+   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(uint16_t));
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-100.0f, 100.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   const size_t num_buffers = 1 +
+     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(uint16_t));
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(elements);
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(packed_elements * num_buffers);
+
+   std::generate(x.begin(), x.end(), std::ref(f16rng));
+
+   benchmark::utils::DisableDenormals();
+
+   xnn_f16_expminus_params params;
+   init_params(&params);
+
+   size_t buffer_index = 0;
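+   // The max reduction runs with timing paused, so only the exp-and-accumulate
+   // kernel itself is measured.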
+   for (auto _ : state) {
+     state.PauseTiming();
+     uint16_t x_max = UINT16_C(0x7E00) /* NaN */;
+     rmax(elements * sizeof(uint16_t), x.data(), &x_max);
+     if (++buffer_index == num_buffers) {
+       buffer_index = 0;
+     }
+     state.ResumeTiming();
+
+     uint16_t y_sum = UINT16_C(0x7E00) /* NaN */;
+     raddstoreexpminusmax(elements * sizeof(uint16_t), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = 2 * elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
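+ // Variants differ only in unroll width (x32 ... x96 elements per loop iteration)
+ // and in the number of partial accumulators (accN) used for the running sum.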
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x32_acc4,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc4,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x40_acc5,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc5,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x48_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x64_acc4,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x64_acc4,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x72_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x72_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x80_acc5,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x80_acc5,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc2,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc2,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc3,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc3,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, neonfp16arith_rr2_p2_x96_acc6,
+   xnn_f16_rmax_ukernel__neonfp16arith,
+   xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x96_acc6,
+   xnn_init_f16_expminus_fp16arith_rr2_p2_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x32_acc4,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x32_acc4,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x40_acc5,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40_acc5,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x48_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x48_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x64_acc4,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc4,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x72_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x80_acc5,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc5,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc2,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc3,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_raddstoreexpminusmax, avx2_rr1_p2_x96_acc6,
+   xnn_f16_rmax_ukernel__f16c,
+   xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6,
+   xnn_init_f16_expminus_avx2_rr1_p2_params,
+   benchmark::utils::CheckAVX2)
+   ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-rsum.cc ADDED
@@ -0,0 +1,101 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/reduce.h>
+
+
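+ // Benchmarks the scaled-sum reduction: the rsum kernel adds all fp16 inputs and
+ // writes scale * sum to a single fp16 output (scale = 0.1 here).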
+ static void f16_rsum(
+   benchmark::State& state,
+   xnn_f16_rsum_ukernel_fn rsum,
+   xnn_init_f16_scale_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check != nullptr && !isa_check(state)) {
+     return;
+   }
+
+   const size_t elements = state.range(0);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
+   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(elements);
+   std::generate(input.begin(), input.end(), std::ref(f16rng));
+
+   xnn_f16_scale_params params;
+   init_params(&params, /*scale=*/fp16_ieee_from_fp32_value(0.1f));
+
+   uint16_t output = UINT16_C(0x7E00);  /* NaN */
+   for (auto _ : state) {
+     rsum(elements * sizeof(uint16_t), input.data(), &output, &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
+   const size_t elements_per_iteration = elements;
+   state.counters["elements"] =
+     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+   const size_t bytes_per_iteration = elements * sizeof(uint16_t);
+   state.counters["bytes"] =
+     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x8,
+   xnn_f16_rsum_ukernel__neonfp16arith_x8,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x16_acc2,
+   xnn_f16_rsum_ukernel__neonfp16arith_x16_acc2,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x24_acc3,
+   xnn_f16_rsum_ukernel__neonfp16arith_x24_acc3,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc2,
+   xnn_f16_rsum_ukernel__neonfp16arith_x32_acc2,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_rsum, neonfp16arith_x32_acc4,
+   xnn_f16_rsum_ukernel__neonfp16arith_x32_acc4,
+   xnn_init_f16_scale_fp16arith_params,
+   benchmark::utils::CheckNEONFP16ARITH)
+   ->Apply(benchmark::utils::ReductionParameters<uint16_t>)
+   ->UseRealTime();
+ #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-spmm.cc ADDED
@@ -0,0 +1,247 @@
+ // Copyright 2019 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cassert>
+ #include <cmath>
+ #include <cstddef>
+ #include <cstdlib>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/spmm.h"
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/spmm.h>
+
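+ // Doubling the raw bits discards the sign bit via 16-bit overflow, so both
+ // +0.0 (0x0000) and -0.0 (0x8000) are treated as zero.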
+ static inline bool is_fp16_zero(uint16_t x) {
+   const uint16_t two_x = x + x;
+   return two_x == 0;
+ }
+
+ static void f16_spmm(benchmark::State& state,
+   xnn_f16_spmm_minmax_ukernel_fn spmm, uint32_t mr, uint32_t nr, float sparsity,
+   xnn_init_f16_minmax_params_fn init_params,
+   benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+   if (isa_check && !isa_check(state)) {
+     return;
+   }
+   const size_t mc = state.range(0);
+   const size_t nc = state.range(1);
+   const size_t kc = state.range(2);
+
+   std::random_device random_device;
+   auto rng = std::mt19937(random_device());
+   std::uniform_real_distribution<float> f32dist;
+   std::uniform_real_distribution<float> pdist;
+
+   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(kc * mc);
+   // Think of b as (n/nr + n % nr) x k; expansion happens later.
+   const size_t ncols = nc / nr + nc % nr;
+   std::vector<uint16_t> b(ncols * kc);
+   std::vector<uint16_t> bias(nc);
+   // Number of non-zero weights per N (output channel).
+   std::vector<uint32_t> nmap(nc);
+   // Mapping from index of non-zero weight to increment of K (input channel) following this index.
+   std::vector<int32_t> dmap(nc * kc);
+   std::vector<uint16_t> w(nc * kc + nc);
+   std::vector<uint16_t> output(nc * mc);
+
+   std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::generate(b.begin(), b.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
+   std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+   std::fill(nmap.begin(), nmap.end(), 0);
+   std::fill(dmap.begin(), dmap.end(), 0);
+   std::fill(w.begin(), w.end(), 0);
+
+   for (uint16_t& b_value : b) {
+     if (pdist(rng) <= sparsity) {
+       b_value = 0;
+     }
+   }
+
+   uint32_t nnz = 0;
+   uint32_t wcnt = 0;
+   size_t last_kk = 0;
+   bool first_nzz = true;
+   size_t first_kk = 0;
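+   // Pack w in XNNPACK's sparse layout: per block of nr output channels, nr bias
+   // values followed by an nr-wide group of weights for every non-zero; dmap
+   // records the byte increment of the input pointer between consecutive
+   // non-zeros, and nmap counts non-zeros per block.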
+   for (size_t nn = 0; nn < nc / nr; nn++) {
+     for (size_t i = 0; i < nr; ++i)
+       w[wcnt++] = bias[nr * nn + i];
+     for (size_t kk = 0; kk < kc; kk++) {
+       if (!is_fp16_zero(b[nn * kc + kk])) {
+         // Every non-zero actually corresponds to nr adjacent non-zeros.
+         for (size_t i = 0; i < nr; ++i)
+           w[wcnt++] = fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
+         // Skip the very first non-zero weight, as we record only the differences.
+         if (first_nzz) {
+           first_kk = kk;
+         } else {
+           const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+           dmap[nnz++] = increment;
+         }
+         last_kk = kk;
+         first_nzz = false;
+         nmap[nn] += 1;
+       }
+     }
+   }
+
+   // Now that the blocked part of the matrix is constructed, handle the leftover
+   // columns, which always use nr=1.
+   for (size_t nn = nc / nr; nn < ncols; nn++) {
+     w[wcnt++] = bias[(nc / nr) * nr + (nn - nc / nr)];
+     for (size_t kk = 0; kk < kc; kk++) {
+       if (!is_fp16_zero(b[nn * kc + kk])) {
+         // Every non-zero actually corresponds to nr adjacent non-zeros.
+         w[wcnt++] = b[nn * kc + kk];
+         // Skip the very first non-zero weight, as we record only the differences.
+         if (first_nzz) {
+           first_kk = kk;
+         } else {
+           const int32_t increment = int32_t(kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+           dmap[nnz++] = increment;
+         }
+         last_kk = kk;
+         first_nzz = false;
+         nmap[nn] += 1;
+       }
+     }
+   }
+   // In the end, the input pointer must be returned to its initial value.
+   const int32_t increment = int32_t(first_kk - last_kk) * int32_t(mc * sizeof(uint16_t));
+   dmap[nnz++] = increment;
+
+   // Generate the expanded b used in the reference calculation: wherever the
+   // original has a non-zero, copy it and add adjacent non-zeros with
+   // incremented weight values.
+   std::vector<uint16_t> b_full(nc * kc);
+   if (nr == 1) {
+     b_full = b;
+   }
+   else {
+     for (size_t nn = 0; nn < nc / nr; nn++) {
+       for (size_t kk = 0; kk < kc; kk++) {
+         if (b[nn * kc + kk] != 0.0f) {
+           for (size_t i = 0; i < nr; ++i)
+             b_full[nr * nn * kc + i * kc + kk] = fp16_ieee_from_fp32_value(
+               fp16_ieee_to_fp32_value(b[nn * kc + kk]) + static_cast<float>(i));
+         }
+       }
+     }
+     for (size_t nn = nc / nr; nn < ncols; nn++) {
+       for (size_t kk = 0; kk < kc; kk++) {
+         if (b[nn * kc + kk] != 0.0f) {
+           b_full[nr * (nc / nr) * kc + (nn - nc / nr) * kc + kk] = b[nn * kc + kk];
+         }
+       }
+     }
+   }
+
+   // The micro-kernel can access one element beyond w and dmap for software pipelining.
+   w.resize(wcnt + 1);
+   dmap.resize(nnz + 1);
+
+   // Prepare parameters.
+   xnn_f16_minmax_params params;
+   init_params(&params, 0xFC00 /* -inf */, 0x7C00 /* inf */);
+   for (auto _ : state) {
+     spmm(mc * sizeof(uint16_t), nc,
+       input.data() + first_kk * mc,
+       w.data(), dmap.data(), nmap.data(),
+       output.data(), mc * sizeof(uint16_t),
+       &params);
+   }
+
+   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+   if (cpu_frequency != 0) {
+     state.counters["cpufreq"] = cpu_frequency;
+   }
+
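+   // FLOPS counts only the MACs actually performed on non-zero weights;
+   // EffFLOPS counts the work an equivalent dense matrix multiplication would do.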
+   state.counters["FLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nnz, benchmark::Counter::kIsRate);
+
+   state.counters["EffFLOPS"] = benchmark::Counter(
+     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_8x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_pipelined, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_8x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_x2, 8, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_pipelined, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_16x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_x2, 16, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_pipelined, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_24x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_x2, 24, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith_pipelined(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_pipelined, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+ static void spmm80_32x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
+   f16_spmm(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_x2, 32, 1, 0.8f,
+     xnn_init_f16_minmax_fp16arith_params, benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_pipelined)
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith)
+ BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_x2)
+ BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_x2)
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-velu.cc ADDED
@@ -0,0 +1,104 @@
+ // Copyright 2022 Google LLC
2
+ //
3
+ // This source code is licensed under the BSD-style license found in the
4
+ // LICENSE file in the root directory of this source tree.
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <functional>
9
+ #include <random>
10
+ #include <vector>
11
+
12
+ #include <benchmark/benchmark.h>
13
+ #include <fp16/fp16.h>
14
+ #include "bench/utils.h"
15
+
16
+ #include <xnnpack.h>
17
+ #include <xnnpack/aligned-allocator.h>
18
+ #include <xnnpack/common.h>
19
+ #include <xnnpack/microfnptr.h>
20
+ #include <xnnpack/microparams-init.h>
21
+ #include <xnnpack/vunary.h>
22
+
23
+
24
+ static void f16_velu(
25
+ benchmark::State& state,
26
+ xnn_f16_velu_ukernel_fn elu,
27
+ xnn_init_f16_elu_params_fn init_params,
28
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
29
+ {
30
+ if (isa_check && !isa_check(state)) {
31
+ return;
32
+ }
33
+
34
+ const size_t num_elements = state.range(0);
35
+
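+ // Inputs: uniform FP32 values in [-9, 9], converted to IEEE FP16 bit patterns stored as uint16_t.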
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-9.0f, 9.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ union xnn_f16_elu_params params;
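+ // 0x3C00 is 1.0 in IEEE half precision, so prescale = alpha = beta = 1.0h below.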
+ init_params(&params,
+ UINT16_C(0x3C00) /* prescale = 1.0h */,
+ UINT16_C(0x3C00) /* alpha = 1.0h */,
+ UINT16_C(0x3C00) /* beta = 1.0h */);
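+ // Note: the ukernel's first argument is the batch size in bytes, not elements.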
+ for (auto _ : state) {
+ elu(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
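+ // Traffic per element: one uint16_t read plus one uint16_t write, hence the factor of 2 below.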
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x8,
+ xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x8,
+ xnn_init_f16_elu_fp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_velu, neonfp16arith_rr1_p3_x16,
+ xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
+ xnn_init_f16_elu_fp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x8,
+ xnn_f16_velu_ukernel__avx2_rr1_p3_x8,
+ xnn_init_f16_elu_avx2_rr1_p3_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_velu, avx2_rr1_p3_x16,
+ xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
+ xnn_init_f16_elu_avx2_rr1_p3_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vsigmoid.cc ADDED
@@ -0,0 +1,319 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vsigmoid(
+ benchmark::State& state,
+ xnn_f16_vsigmoid_ukernel_fn sigmoid,
+ xnn_init_f16_sigmoid_params_fn init_params,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
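+ // Outputs are pre-filled with FP16 NaN (0x7E00) so unwritten lanes are easy to spot.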
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ xnn_f16_sigmoid_params params;
+ init_params(&params);
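+ // As in the other f16 harnesses, the batch size passed to the ukernel below is in bytes.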
+ for (auto _ : state) {
+ sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x8,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x16,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x24,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x32,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x40,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x48,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x56,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, aarch64_neonfp16arith_rr2_p2_div_x64,
+ xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1fma_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x8,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x16,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x24,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x32,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x40,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x48,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x56,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr2_p2_nr1recps_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x64,
+ xnn_init_f16_sigmoid_fp16arith_rr2_p2_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x8,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x8,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x16,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x16,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x24,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x24,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x32,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x32,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x40,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x40,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x48,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x48,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x56,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x56,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_div_x64,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x64,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x8,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x8,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x16,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x16,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x24,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x24,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x32,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x40,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x40,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x48,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x48,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x56,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x56,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, avx2_rr1_p2_rcp_x64,
+ xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x64,
+ xnn_init_f16_sigmoid_avx2_rr1_p2_params,
+ benchmark::utils::CheckAVX2)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vsqrt.cc ADDED
@@ -0,0 +1,121 @@
+ // Copyright 2022 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vsqrt(
+ benchmark::State& state,
+ xnn_f16_vsqrt_ukernel_fn sqrt,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
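+ // The default-constructed distribution draws from [0, 1), so every input is a valid sqrt argument.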
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
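+ // vsqrt needs no params struct in this harness, so the ukernel is invoked with nullptr.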
+ for (auto _ : state) {
+ sqrt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+ }
+
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x8,
+ xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x8,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, aarch64_neonfp16arith_sqrt_x16,
+ xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_x16,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x8,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x8,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x16,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x16,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x24,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x24,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, neonfp16arith_nr1fma1adj_x32,
+ xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_x32,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+ #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x1,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x1,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x2,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x2,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsqrt, fp16arith_sqrt_x4,
+ xnn_f16_vsqrt_ukernel__fp16arith_sqrt_x4,
+ benchmark::utils::CheckFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
+
+
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
bench/f16-vtanh.cc ADDED
@@ -0,0 +1,807 @@
+ // Copyright 2023 Google LLC
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <algorithm>
+ #include <cmath>
+ #include <functional>
+ #include <random>
+ #include <vector>
+
+ #include <benchmark/benchmark.h>
+ #include <fp16/fp16.h>
+ #include "bench/utils.h"
+
+ #include <xnnpack.h>
+ #include <xnnpack/aligned-allocator.h>
+ #include <xnnpack/common.h>
+ #include <xnnpack/microfnptr.h>
+ #include <xnnpack/microparams-init.h>
+ #include <xnnpack/vunary.h>
+
+
+ static void f16_vtanh(
+ benchmark::State& state,
+ xnn_f16_vtanh_ukernel_fn tanh,
+ xnn_init_f16_tanh_params_fn init_params = nullptr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+ {
+ if (isa_check != nullptr && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-5.0f, 5.0f), std::ref(rng));
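+ // Inputs in [-5, 5] exercise both the polynomial core of tanh and its near-saturated tails.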
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ xnn_f16_tanh_params params;
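+ // init_params may be nullptr: some variants (e.g. the AArch64 div kernels below) take no parameters.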
47
+ if (init_params != nullptr) {
48
+ init_params(&params);
49
+ }
50
+ for (auto _ : state) {
51
+ tanh(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
52
+ }
53
+
54
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
55
+ if (cpu_frequency != 0) {
56
+ state.counters["cpufreq"] = cpu_frequency;
57
+ }
58
+
59
+ const size_t elements_per_iteration = num_elements;
60
+ state.counters["elements"] =
61
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
62
+
63
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
64
+ state.counters["bytes"] =
65
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
66
+ }
67
+
68
+
69
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
70
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x8,
71
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x8,
72
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
73
+ benchmark::utils::CheckAVX2)
74
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
75
+ ->UseRealTime();
76
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x16,
77
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x16,
78
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
79
+ benchmark::utils::CheckAVX2)
80
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
81
+ ->UseRealTime();
82
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x24,
83
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x24,
84
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
85
+ benchmark::utils::CheckAVX2)
86
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
87
+ ->UseRealTime();
88
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x32,
89
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x32,
90
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
91
+ benchmark::utils::CheckAVX2)
92
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
93
+ ->UseRealTime();
94
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x40,
95
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x40,
96
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
97
+ benchmark::utils::CheckAVX2)
98
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
99
+ ->UseRealTime();
100
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x48,
101
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x48,
102
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
103
+ benchmark::utils::CheckAVX2)
104
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
105
+ ->UseRealTime();
106
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x56,
107
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x56,
108
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
109
+ benchmark::utils::CheckAVX2)
110
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
111
+ ->UseRealTime();
112
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x64,
113
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x64,
114
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
115
+ benchmark::utils::CheckAVX2)
116
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
117
+ ->UseRealTime();
118
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x72,
119
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x72,
120
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
121
+ benchmark::utils::CheckAVX2)
122
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
123
+ ->UseRealTime();
124
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_div_x80,
125
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_div_x80,
126
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
127
+ benchmark::utils::CheckAVX2)
128
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
129
+ ->UseRealTime();
130
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x8,
131
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x8,
132
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
133
+ benchmark::utils::CheckAVX2)
134
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
135
+ ->UseRealTime();
136
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x16,
137
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x16,
138
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
139
+ benchmark::utils::CheckAVX2)
140
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
141
+ ->UseRealTime();
142
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x24,
143
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x24,
144
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
145
+ benchmark::utils::CheckAVX2)
146
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
147
+ ->UseRealTime();
148
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x32,
149
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x32,
150
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
151
+ benchmark::utils::CheckAVX2)
152
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
153
+ ->UseRealTime();
154
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x40,
155
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x40,
156
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
157
+ benchmark::utils::CheckAVX2)
158
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
159
+ ->UseRealTime();
160
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x48,
161
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x48,
162
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
163
+ benchmark::utils::CheckAVX2)
164
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
165
+ ->UseRealTime();
166
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x56,
167
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x56,
168
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
169
+ benchmark::utils::CheckAVX2)
170
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
171
+ ->UseRealTime();
172
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x64,
173
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x64,
174
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
175
+ benchmark::utils::CheckAVX2)
176
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
177
+ ->UseRealTime();
178
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x72,
179
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x72,
180
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
181
+ benchmark::utils::CheckAVX2)
182
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
183
+ ->UseRealTime();
184
+ BENCHMARK_CAPTURE(f16_vtanh, avx2_expm1minus_rr1_p3h2ts_rcp_x80,
185
+ xnn_f16_vtanh_ukernel__avx2_expm1minus_rr1_p3h2ts_rcp_x80,
186
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
187
+ benchmark::utils::CheckAVX2)
188
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
189
+ ->UseRealTime();
190
+
191
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x8,
192
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x8,
193
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
194
+ benchmark::utils::CheckFMA3)
195
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
196
+ ->UseRealTime();
197
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x16,
198
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x16,
199
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
200
+ benchmark::utils::CheckFMA3)
201
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
202
+ ->UseRealTime();
203
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x24,
204
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x24,
205
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
206
+ benchmark::utils::CheckFMA3)
207
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
208
+ ->UseRealTime();
209
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x32,
210
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x32,
211
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
212
+ benchmark::utils::CheckFMA3)
213
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
214
+ ->UseRealTime();
215
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x40,
216
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x40,
217
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
218
+ benchmark::utils::CheckFMA3)
219
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
220
+ ->UseRealTime();
221
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x48,
222
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x48,
223
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
224
+ benchmark::utils::CheckFMA3)
225
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
226
+ ->UseRealTime();
227
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x56,
228
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x56,
229
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
230
+ benchmark::utils::CheckFMA3)
231
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
232
+ ->UseRealTime();
233
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x64,
234
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x64,
235
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
236
+ benchmark::utils::CheckFMA3)
237
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
238
+ ->UseRealTime();
239
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x72,
240
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x72,
241
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
242
+ benchmark::utils::CheckFMA3)
243
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
244
+ ->UseRealTime();
245
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_div_x80,
246
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_div_x80,
247
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
248
+ benchmark::utils::CheckFMA3)
249
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
250
+ ->UseRealTime();
251
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x8,
252
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x8,
253
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
254
+ benchmark::utils::CheckFMA3)
255
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
256
+ ->UseRealTime();
257
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x16,
258
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x16,
259
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
260
+ benchmark::utils::CheckFMA3)
261
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
262
+ ->UseRealTime();
263
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x24,
264
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x24,
265
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
266
+ benchmark::utils::CheckFMA3)
267
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
268
+ ->UseRealTime();
269
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x32,
270
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x32,
271
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
272
+ benchmark::utils::CheckFMA3)
273
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
274
+ ->UseRealTime();
275
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x40,
276
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x40,
277
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
278
+ benchmark::utils::CheckFMA3)
279
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
280
+ ->UseRealTime();
281
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x48,
282
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x48,
283
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
284
+ benchmark::utils::CheckFMA3)
285
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
286
+ ->UseRealTime();
287
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x56,
288
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x56,
289
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
290
+ benchmark::utils::CheckFMA3)
291
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
292
+ ->UseRealTime();
293
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x64,
294
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x64,
295
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
296
+ benchmark::utils::CheckFMA3)
297
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
298
+ ->UseRealTime();
299
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x72,
300
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x72,
301
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
302
+ benchmark::utils::CheckFMA3)
303
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
304
+ ->UseRealTime();
305
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_expm1minus_rr1_p3h2ts_rcp_x80,
306
+ xnn_f16_vtanh_ukernel__fma3_expm1minus_rr1_p3h2ts_rcp_x80,
307
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
308
+ benchmark::utils::CheckFMA3)
309
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
310
+ ->UseRealTime();
311
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x8,
312
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x8,
313
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
314
+ benchmark::utils::CheckFMA3)
315
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
316
+ ->UseRealTime();
317
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x16,
318
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x16,
319
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
320
+ benchmark::utils::CheckFMA3)
321
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
322
+ ->UseRealTime();
323
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x24,
324
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x24,
325
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
326
+ benchmark::utils::CheckFMA3)
327
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
328
+ ->UseRealTime();
329
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x32,
330
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x32,
331
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
332
+ benchmark::utils::CheckFMA3)
333
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
334
+ ->UseRealTime();
335
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x40,
336
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x40,
337
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
338
+ benchmark::utils::CheckFMA3)
339
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
340
+ ->UseRealTime();
341
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x48,
342
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x48,
343
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
344
+ benchmark::utils::CheckFMA3)
345
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
346
+ ->UseRealTime();
347
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x56,
348
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x56,
349
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
350
+ benchmark::utils::CheckFMA3)
351
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
352
+ ->UseRealTime();
353
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x64,
354
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x64,
355
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
356
+ benchmark::utils::CheckFMA3)
357
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
358
+ ->UseRealTime();
359
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x72,
360
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x72,
361
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
362
+ benchmark::utils::CheckFMA3)
363
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
364
+ ->UseRealTime();
365
+ BENCHMARK_CAPTURE(f16_vtanh, fma3_polynomial_p19h9t2_x80,
366
+ xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_x80,
367
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
368
+ benchmark::utils::CheckFMA3)
369
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
370
+ ->UseRealTime();
371
+
372
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x8,
373
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x8,
374
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
375
+ benchmark::utils::CheckF16C)
376
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
377
+ ->UseRealTime();
378
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x16,
379
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x16,
380
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
381
+ benchmark::utils::CheckF16C)
382
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
383
+ ->UseRealTime();
384
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x24,
385
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x24,
386
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
387
+ benchmark::utils::CheckF16C)
388
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
389
+ ->UseRealTime();
390
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x32,
391
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x32,
392
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
393
+ benchmark::utils::CheckF16C)
394
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
395
+ ->UseRealTime();
396
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x40,
397
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x40,
398
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
399
+ benchmark::utils::CheckF16C)
400
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
401
+ ->UseRealTime();
402
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x48,
403
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x48,
404
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
405
+ benchmark::utils::CheckF16C)
406
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
407
+ ->UseRealTime();
408
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x56,
409
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x56,
410
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
411
+ benchmark::utils::CheckF16C)
412
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
413
+ ->UseRealTime();
414
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x64,
415
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x64,
416
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
417
+ benchmark::utils::CheckF16C)
418
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
419
+ ->UseRealTime();
420
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x72,
421
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x72,
422
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
423
+ benchmark::utils::CheckF16C)
424
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
425
+ ->UseRealTime();
426
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_div_x80,
427
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_div_x80,
428
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
429
+ benchmark::utils::CheckF16C)
430
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
431
+ ->UseRealTime();
432
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x8,
433
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x8,
434
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
435
+ benchmark::utils::CheckF16C)
436
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
437
+ ->UseRealTime();
438
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x16,
439
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x16,
440
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
441
+ benchmark::utils::CheckF16C)
442
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
443
+ ->UseRealTime();
444
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x24,
445
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x24,
446
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
447
+ benchmark::utils::CheckF16C)
448
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
449
+ ->UseRealTime();
450
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x32,
451
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x32,
452
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
453
+ benchmark::utils::CheckF16C)
454
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
455
+ ->UseRealTime();
456
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x40,
457
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x40,
458
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
459
+ benchmark::utils::CheckF16C)
460
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
461
+ ->UseRealTime();
462
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x48,
463
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x48,
464
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
465
+ benchmark::utils::CheckF16C)
466
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
467
+ ->UseRealTime();
468
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x56,
469
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x56,
470
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
471
+ benchmark::utils::CheckF16C)
472
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
473
+ ->UseRealTime();
474
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x64,
475
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x64,
476
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
477
+ benchmark::utils::CheckF16C)
478
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
479
+ ->UseRealTime();
480
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x72,
481
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x72,
482
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
483
+ benchmark::utils::CheckF16C)
484
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
485
+ ->UseRealTime();
486
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_expm1minus_rr1_p3h2ts_rcp_x80,
487
+ xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_x80,
488
+ xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params,
489
+ benchmark::utils::CheckF16C)
490
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
491
+ ->UseRealTime();
492
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x8,
493
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x8,
494
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
495
+ benchmark::utils::CheckF16C)
496
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
497
+ ->UseRealTime();
498
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x16,
499
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x16,
500
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
501
+ benchmark::utils::CheckF16C)
502
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
503
+ ->UseRealTime();
504
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x24,
505
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x24,
506
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
507
+ benchmark::utils::CheckF16C)
508
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
509
+ ->UseRealTime();
510
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x32,
511
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x32,
512
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
513
+ benchmark::utils::CheckF16C)
514
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
515
+ ->UseRealTime();
516
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x40,
517
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x40,
518
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
519
+ benchmark::utils::CheckF16C)
520
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
521
+ ->UseRealTime();
522
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x48,
523
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x48,
524
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
525
+ benchmark::utils::CheckF16C)
526
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
527
+ ->UseRealTime();
528
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x56,
529
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x56,
530
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
531
+ benchmark::utils::CheckF16C)
532
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
533
+ ->UseRealTime();
534
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x64,
535
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x64,
536
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
537
+ benchmark::utils::CheckF16C)
538
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
539
+ ->UseRealTime();
540
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x72,
541
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x72,
542
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
543
+ benchmark::utils::CheckF16C)
544
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
545
+ ->UseRealTime();
546
+ BENCHMARK_CAPTURE(f16_vtanh, f16c_polynomial_p19h9t2_x80,
547
+ xnn_f16_vtanh_ukernel__f16c_polynomial_p19h9t2_x80,
548
+ xnn_init_f16_tanh_avx_polynomial_p19h9t2_params,
549
+ benchmark::utils::CheckF16C)
550
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
551
+ ->UseRealTime();
552
+ #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
553
+
554
+ #if XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
+                     xnn_f16_vtanh_ukernel__aarch64_neonfp16arith_expm1minus_rr1_p3h2ts_div_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && XNN_ARCH_ARM64
+ 
+ 
+ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
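+   // The kernels in this block replace the vector divide with reciprocal
+   // approximations, so they are also usable on 32-bit ARM: nr1fma refines a
+   // reciprocal estimate with one Newton-Raphson step built from fused
+   // multiply-adds, while the nr1recps and recpeadj groups further below swap
+   // in different refinement strategies.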
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1fma_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ 
+ 
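+   // nr1recps: same tanh algorithm as nr1fma above, but the Newton-Raphson
+   // step presumably uses NEON's dedicated reciprocal-step instruction
+   // (FRECPS/VRECPS) instead of explicit fused multiply-adds.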
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_nr1recps_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ 
+ 
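+   // recpeadj: appears to skip the Newton-Raphson iteration entirely and
+   // instead correct the raw reciprocal estimate (FRECPE) with a single
+   // adjustment multiply, trading a little accuracy for lower latency.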
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x8,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x16,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x24,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x32,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x40,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x48,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x56,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x64,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x72,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+   BENCHMARK_CAPTURE(f16_vtanh, neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
+                     xnn_f16_vtanh_ukernel__neonfp16arith_expm1minus_rr1_p3h2ts_recpeadj_x80,
+                     nullptr,
+                     benchmark::utils::CheckNEONFP16ARITH)
+     ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+     ->UseRealTime();
+ #endif  // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
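+ 
+ // For orientation, a minimal sketch of the f16_vtanh harness that the
+ // BENCHMARK_CAPTURE entries above bind to (the real definition appears
+ // earlier in this file; the signature below is an assumption, not a copy):
+ //
+ //   static void f16_vtanh(benchmark::State& state,
+ //                         xnn_f16_vunary_ukernel_fn ukernel,
+ //                         xnn_init_f16_tanh_params_fn init_params,  // may be nullptr
+ //                         benchmark::utils::IsaCheckFunction isa_check) {
+ //     if (isa_check != nullptr && !isa_check(state)) return;  // skip on unsupported CPUs
+ //     // ... allocate and randomize a batch_size-element fp16 input buffer ...
+ //     for (auto _ : state) {
+ //       ukernel(batch_size * sizeof(uint16_t), input, output, &params);
+ //     }
+ //   }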
+ 
+ #ifndef XNNPACK_BENCHMARK_NO_MAIN
+ BENCHMARK_MAIN();
+ #endif
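+ 
+ // Compiling with XNNPACK_BENCHMARK_NO_MAIN defined suppresses the
+ // BENCHMARK_MAIN() entry point above, which lets this translation unit be
+ // linked into a combined benchmark binary that supplies its own main().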