test / src /configs /dwconv-config.c
Androidonnxfork's picture
Upload folder using huggingface_hub
8b7c501
raw
history blame
82.5 kB
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <stddef.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <pthread.h>
#endif
#ifndef __EMSCRIPTEN__
#include <cpuinfo.h>
#endif
#include <xnnpack/common.h>
#include <xnnpack/config.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/dwconv.h>
// Per-datatype tables of depthwise-convolution micro-kernel configurations.
// Every table is file-local and explicitly zero-initialized, so an entry that
// no init function writes keeps all-zero fields. Each table's capacity comes
// from an XNN_MAX_*_DWCONV_UKERNELS constant; note that the qs8_qc8w table is
// sized by XNN_MAX_QC8_DWCONV_UKERNELS.
// init_f16_dwconv_config() and init_f32_dwconv_config() in this file populate
// the f16 and f32 tables. Presumably the quantized tables are filled by
// analogous init functions elsewhere in the file - TODO confirm.
static struct xnn_dwconv_config f16_dwconv_config[XNN_MAX_F16_DWCONV_UKERNELS] = {0};
static struct xnn_dwconv_config f32_dwconv_config[XNN_MAX_F32_DWCONV_UKERNELS] = {0};
static struct xnn_dwconv_config qs8_qc8w_dwconv_config[XNN_MAX_QC8_DWCONV_UKERNELS] = {0};
static struct xnn_dwconv_config qs8_dwconv_config[XNN_MAX_QS8_DWCONV_UKERNELS] = {0};
static struct xnn_dwconv_config qu8_dwconv_config[XNN_MAX_QU8_DWCONV_UKERNELS] = {0};
// One once-initialization guard per configuration table, named
// init_guard_<table prefix>. Windows builds use INIT_ONCE objects; all other
// builds use pthread_once_t. Both variants start in their static
// "not yet run" state.
// Presumably each guard is paired with the matching init_*_dwconv_config()
// function by a once-init call later in the file - TODO confirm.
// NOTE(review): the includes at the top of the file select <windows.h> with
// `#ifdef _WIN32`, while this block tests XNN_PLATFORM_WINDOWS; confirm the
// two conditions always agree.
#if XNN_PLATFORM_WINDOWS
static INIT_ONCE init_guard_f16_dwconv = INIT_ONCE_STATIC_INIT;
static INIT_ONCE init_guard_f32_dwconv = INIT_ONCE_STATIC_INIT;
static INIT_ONCE init_guard_qs8_qc8w_dwconv = INIT_ONCE_STATIC_INIT;
static INIT_ONCE init_guard_qs8_dwconv = INIT_ONCE_STATIC_INIT;
static INIT_ONCE init_guard_qu8_dwconv = INIT_ONCE_STATIC_INIT;
#else
// Non-Windows builds: POSIX once-control objects.
static pthread_once_t init_guard_f16_dwconv = PTHREAD_ONCE_INIT;
static pthread_once_t init_guard_f32_dwconv = PTHREAD_ONCE_INIT;
static pthread_once_t init_guard_qs8_qc8w_dwconv = PTHREAD_ONCE_INIT;
static pthread_once_t init_guard_qs8_dwconv = PTHREAD_ONCE_INIT;
static pthread_once_t init_guard_qu8_dwconv = PTHREAD_ONCE_INIT;
#endif
// Populates entries 0..3 of f16_dwconv_config with half-precision unipass
// depthwise-convolution micro-kernels whose primary_tile is 3, 4, 9 and 25.
//
// The kernel set depends on the build target and on flags read from
// xnn_init_hardware_config():
//   - ARM / ARM64 with FP16 vector support: NEON FP16 arithmetic kernels,
//     used only when use_arm_neon_fp16_arith is set.
//   - non-mobile x86 / x86-64: the *__fma3 kernels, used only when
//     use_x86_avx2 is set.
// On any other target, or when the relevant flag is clear, nothing is written
// and the table keeps its zero-initialized contents.
static void init_f16_dwconv_config(void) {
  #if XNN_ENABLE_ARM_FP16_VECTOR && ((XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_SCALAR) || XNN_ARCH_ARM64)
    const struct xnn_hardware_config* hw = xnn_init_hardware_config();
    assert(hw != NULL);
    if (hw->use_arm_neon_fp16_arith) {
      struct xnn_dwconv_config* const cfg_3p = &f16_dwconv_config[0];
      struct xnn_dwconv_config* const cfg_4p = &f16_dwconv_config[1];
      struct xnn_dwconv_config* const cfg_9p = &f16_dwconv_config[2];
      struct xnn_dwconv_config* const cfg_25p = &f16_dwconv_config[3];

      // primary_tile 3, 16 channels per tile.
      cfg_3p->init.f16 = xnn_init_f16_minmax_fp16arith_params;
      cfg_3p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_3p16c__neonfp16arith;
      cfg_3p->primary_tile = 3;
      cfg_3p->channel_tile = 16;
      cfg_3p->channel_subtile = 16;
      cfg_3p->channel_round = 1;

      // primary_tile 4, 16 channels per tile.
      cfg_4p->init.f16 = xnn_init_f16_minmax_fp16arith_params;
      cfg_4p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_4p16c__neonfp16arith;
      cfg_4p->primary_tile = 4;
      cfg_4p->channel_tile = 16;
      cfg_4p->channel_subtile = 16;
      cfg_4p->channel_round = 1;

      // primary_tile 9: the only entry that differs between the 32-bit and
      // 64-bit ARM builds (8-channel vs 16-channel kernel).
      cfg_9p->init.f16 = xnn_init_f16_minmax_fp16arith_params;
      cfg_9p->primary_tile = 9;
      cfg_9p->channel_round = 1;
      #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_SCALAR
        cfg_9p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p8c__neonfp16arith;
        cfg_9p->channel_tile = 8;
        cfg_9p->channel_subtile = 8;
      #else
        cfg_9p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p16c__neonfp16arith;
        cfg_9p->channel_tile = 16;
        cfg_9p->channel_subtile = 16;
      #endif

      // primary_tile 25, 8 channels per tile (acc2 kernel variant).
      cfg_25p->init.f16 = xnn_init_f16_minmax_fp16arith_params;
      cfg_25p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_25p8c__neonfp16arith_acc2;
      cfg_25p->primary_tile = 25;
      cfg_25p->channel_tile = 8;
      cfg_25p->channel_subtile = 8;
      cfg_25p->channel_round = 1;
    }
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
    const struct xnn_hardware_config* hw = xnn_init_hardware_config();
    assert(hw != NULL);
    if (hw->use_x86_avx2) {
      struct xnn_dwconv_config* const cfg_3p = &f16_dwconv_config[0];
      struct xnn_dwconv_config* const cfg_4p = &f16_dwconv_config[1];
      struct xnn_dwconv_config* const cfg_9p = &f16_dwconv_config[2];
      struct xnn_dwconv_config* const cfg_25p = &f16_dwconv_config[3];

      // primary_tile 3, 16 channels per tile.
      cfg_3p->init.f16 = xnn_init_f16_minmax_avx_params;
      cfg_3p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_3p16c__fma3;
      cfg_3p->primary_tile = 3;
      cfg_3p->channel_tile = 16;
      cfg_3p->channel_subtile = 16;
      cfg_3p->channel_round = 1;

      // primary_tile 4, 16 channels per tile.
      cfg_4p->init.f16 = xnn_init_f16_minmax_avx_params;
      cfg_4p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_4p16c__fma3;
      cfg_4p->primary_tile = 4;
      cfg_4p->channel_tile = 16;
      cfg_4p->channel_subtile = 16;
      cfg_4p->channel_round = 1;

      // primary_tile 9, 16 channels per tile.
      cfg_9p->init.f16 = xnn_init_f16_minmax_avx_params;
      cfg_9p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_9p16c__fma3;
      cfg_9p->primary_tile = 9;
      cfg_9p->channel_tile = 16;
      cfg_9p->channel_subtile = 16;
      cfg_9p->channel_round = 1;

      // primary_tile 25, 8 channels per tile (acc2 kernel variant).
      cfg_25p->init.f16 = xnn_init_f16_minmax_avx_params;
      cfg_25p->minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f16_dwconv_minmax_ukernel_25p8c__fma3_acc2;
      cfg_25p->primary_tile = 25;
      cfg_25p->channel_tile = 8;
      cfg_25p->channel_subtile = 8;
      cfg_25p->channel_round = 1;
    }
  #endif
}
static void init_f32_dwconv_config(void) {
#if XNN_ARCH_ARM
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->use_arm_neon) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__neon;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 8,
f32_dwconv_config[0].channel_subtile = 8,
f32_dwconv_config[0].channel_round = 1,
f32_dwconv_config[0].primary_tile = 3,
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__neon;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 8,
f32_dwconv_config[1].channel_subtile = 8,
f32_dwconv_config[1].channel_round = 1,
f32_dwconv_config[1].primary_tile = 4,
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neon;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 4;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 8;
f32_dwconv_config[3].middle_tile = 8;
f32_dwconv_config[3].last_tile = 9;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
} else if (!XNN_PLATFORM_MOBILE) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 1;
f32_dwconv_config[0].channel_subtile = 1;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 1;
f32_dwconv_config[1].channel_subtile = 1;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 1;
f32_dwconv_config[2].channel_subtile = 1;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_2f2m2l4c1s1r__scalar_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 4;
f32_dwconv_config[3].channel_subtile = 1;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 2;
f32_dwconv_config[3].middle_tile = 2;
f32_dwconv_config[3].last_tile = 2;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2;
f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
}
#elif XNN_ARCH_ARM64
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 8;
f32_dwconv_config[0].channel_subtile = 8;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 8;
f32_dwconv_config[1].channel_subtile = 8;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
#if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC || XNN_PLATFORM_WINDOWS
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
switch (cpuinfo_get_core(0)->uarch) {
case cpuinfo_uarch_kryo:
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
break;
#if XNN_ENABLE_ASSEMBLY
case cpuinfo_uarch_cortex_a53:
case cpuinfo_uarch_cortex_a55r0:
case cpuinfo_uarch_cortex_a55:
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 4;
f32_dwconv_config[2].channel_subtile = 4;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
break;
#endif // XNN_ENABLE_ASSEMBLY
default:
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
break;
}
#endif // XNN_PLATFORM_IOS && XNN_PLATFORM_MAC
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 16;
f32_dwconv_config[0].channel_subtile = 16;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 16;
f32_dwconv_config[1].channel_subtile = 16;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 16;
f32_dwconv_config[2].channel_subtile = 16;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 16;
f32_dwconv_config[3].channel_subtile = 16;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 16;
f32_dwconv_config[3].channel_subtile = 16;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
} else if (hardware_config->use_x86_fma3) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[0].channel_tile = 16;
f32_dwconv_config[0].channel_subtile = 16;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__fma3;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[1].channel_tile = 16;
f32_dwconv_config[1].channel_subtile = 16;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__fma3;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[2].channel_tile = 16;
f32_dwconv_config[2].channel_subtile = 16;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l8c8s4r__fma3;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__fma3;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
} else if (hardware_config->use_x86_avx) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[0].channel_tile = 16;
f32_dwconv_config[0].channel_subtile = 16;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[1].channel_tile = 16;
f32_dwconv_config[1].channel_subtile = 16;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[2].channel_tile = 16;
f32_dwconv_config[2].channel_subtile = 16;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_6f6m7l8c8s4r__avx_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 6;
f32_dwconv_config[3].middle_tile = 6;
f32_dwconv_config[3].last_tile = 7;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__avx;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_avx_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
} else {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__sse;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[0].channel_tile = 8;
f32_dwconv_config[0].channel_subtile = 8;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__sse;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[1].channel_tile = 8;
f32_dwconv_config[1].channel_subtile = 8;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__sse;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_8f8m9l16c4s4r__sse_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[3].channel_tile = 16;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 8;
f32_dwconv_config[3].middle_tile = 8;
f32_dwconv_config[3].last_tile = 9;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__sse;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_sse_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
}
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#if XNN_ARCH_WASMRELAXEDSIMD
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[0].channel_tile = 8;
f32_dwconv_config[0].channel_subtile = 8;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[1].channel_tile = 8;
f32_dwconv_config[1].channel_subtile = 8;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#else
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->is_x86) {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p8c__wasmsimd;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[0].channel_tile = 8;
f32_dwconv_config[0].channel_subtile = 8;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p8c__wasmsimd;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[1].channel_tile = 8;
f32_dwconv_config[1].channel_subtile = 8;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[2].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86;
f32_dwconv_config[2].linear.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[2].channel_tile = 4;
f32_dwconv_config[2].channel_subtile = 4;
f32_dwconv_config[2].channel_round = 4;
f32_dwconv_config[2].primary_tile = 3;
f32_dwconv_config[2].middle_tile = 3;
f32_dwconv_config[2].last_tile = 3;
#else
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p8c__wasmsimd;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[2].channel_tile = 8;
f32_dwconv_config[2].channel_subtile = 8;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
} else {
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p4c__wasmsimd;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[0].channel_tile = 4;
f32_dwconv_config[0].channel_subtile = 4;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p4c__wasmsimd;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[1].channel_tile = 4;
f32_dwconv_config[1].channel_subtile = 4;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[2].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm;
f32_dwconv_config[2].linear.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[2].channel_tile = 4;
f32_dwconv_config[2].channel_subtile = 4;
f32_dwconv_config[2].channel_round = 4;
f32_dwconv_config[2].primary_tile = 3;
f32_dwconv_config[2].middle_tile = 3;
f32_dwconv_config[2].last_tile = 3;
#else
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p4c__wasmsimd;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[2].channel_tile = 4;
f32_dwconv_config[2].channel_subtile = 4;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
}
#endif
#if XNN_ARCH_WASMRELAXEDSIMD
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma;
f32_dwconv_config[3].linear.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmrelaxedsimd_fma;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[3].channel_tile = 4;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[3].channel_tile = 8;
f32_dwconv_config[3].channel_subtile = 8;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
#else
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm;
f32_dwconv_config[3].linear.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[3].channel_tile = 4;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 4;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm;
f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p4c__wasmsimd;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
f32_dwconv_config[3].channel_tile = 4;
f32_dwconv_config[3].channel_subtile = 4;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
#endif
#elif XNN_ARCH_WASM
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__wasm_acc2;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 1;
f32_dwconv_config[0].channel_subtile = 1;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__wasm_acc2;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 1;
f32_dwconv_config[1].channel_subtile = 1;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__wasm_acc2;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 1;
f32_dwconv_config[2].channel_subtile = 1;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
#if XNN_ENABLE_DWCONV_MULTIPASS
f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l1c1s1r__wasm;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 1;
f32_dwconv_config[3].channel_subtile = 1;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 5;
f32_dwconv_config[3].middle_tile = 5;
f32_dwconv_config[3].last_tile = 5;
#else
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__wasm_acc2;
f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 1;
f32_dwconv_config[3].channel_subtile = 1;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif // XNN_ENABLE_DWCONV_MULTIPASS
#elif XNN_ARCH_RISCV
f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2;
f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2;
f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[0].channel_tile = 1;
f32_dwconv_config[0].channel_subtile = 1;
f32_dwconv_config[0].channel_round = 1;
f32_dwconv_config[0].primary_tile = 3;
f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p1c__scalar_acc2;
f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p1c__scalar_acc2;
f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[1].channel_tile = 1;
f32_dwconv_config[1].channel_subtile = 1;
f32_dwconv_config[1].channel_round = 1;
f32_dwconv_config[1].primary_tile = 4;
f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p1c__scalar_acc2;
f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p1c__scalar_acc2;
f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[2].channel_tile = 1;
f32_dwconv_config[2].channel_subtile = 1;
f32_dwconv_config[2].channel_round = 1;
f32_dwconv_config[2].primary_tile = 9;
f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p1c__scalar_acc2;
f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p1c__scalar_acc2;
f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
f32_dwconv_config[3].channel_tile = 1;
f32_dwconv_config[3].channel_subtile = 1;
f32_dwconv_config[3].channel_round = 1;
f32_dwconv_config[3].primary_tile = 25;
#endif
}
// Fills one entry of qs8_qc8w_dwconv_config with a unipass microkernel.
// In every configuration below channel_subtile equals channel_tile and
// channel_round is 1, so only the tile width and primary tile are passed in.
#define SET_QS8_QC8W_DWCONV(slot, ukernel, init_fn, tile, primary)             \
  do {                                                                         \
    qs8_qc8w_dwconv_config[slot].minmax.unipass =                              \
        (xnn_dwconv_unipass_ukernel_fn) ukernel;                               \
    qs8_qc8w_dwconv_config[slot].init.qs8_qc8w = init_fn;                      \
    qs8_qc8w_dwconv_config[slot].channel_tile = tile;                          \
    qs8_qc8w_dwconv_config[slot].channel_subtile = tile;                       \
    qs8_qc8w_dwconv_config[slot].channel_round = 1;                            \
    qs8_qc8w_dwconv_config[slot].primary_tile = primary;                       \
  } while (0)

// Populates qs8_qc8w_dwconv_config[0..2] with the unipass microkernels for
// primary tiles 3, 9 and 25. The choice is made by the compile-time
// architecture macro and, on ARM, x86 and WAsm, by flags read from the
// hardware config returned by xnn_init_hardware_config().
static void init_qs8_qc8w_dwconv_config(void) {
#if XNN_ARCH_ARM
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon) {
    // No NEON: scalar kernels, one channel at a time.
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p1c__scalar_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 1, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 1, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 1, 25);
  } else if (hw->use_arm_neon_v8) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 16, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 16, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neonv8_mla8_ld64,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 8, 25);
  } else {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, 16, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, 16, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64,
      xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params, 8, 25);
  }
#elif XNN_ARCH_ARM64
  SET_QS8_QC8W_DWCONV(0,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neonv8_mla8_ld128,
    xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 16, 3);
  SET_QS8_QC8W_DWCONV(1,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neonv8_mla8_ld64,
    xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 16, 9);
  SET_QS8_QC8W_DWCONV(2,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neonv8_mla8_ld64,
    xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params, 16, 25);
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!XNN_PLATFORM_MOBILE && hw->use_x86_avx512skx) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params, 32, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params, 32, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params, 32, 25);
  } else if (hw->use_x86_xop) {
    // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__xop_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__xop_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 25);
  } else if (hw->use_x86_avx2) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx2_params, 16, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx2_params, 16, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32,
      xnn_init_qs8_qc8w_conv_minmax_fp32_avx2_params, 16, 25);
  } else if (hw->use_x86_avx) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 16, 25);
  } else if (hw->use_x86_sse4_1) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse41_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 8, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 8, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params, 8, 25);
  } else {
    // Fallback when none of the flags above is set: SSE2 kernels.
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p8c__sse2_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse2_params, 8, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse2_params, 8, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16,
      xnn_init_qs8_qc8w_conv_minmax_fp32_sse2_params, 8, 25);
  }
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  SET_QS8_QC8W_DWCONV(0,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16,
    xnn_init_qs8_qc8w_conv_minmax_fp32_wasmsimd_params, 16, 3);
  SET_QS8_QC8W_DWCONV(1,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16,
    xnn_init_qs8_qc8w_conv_minmax_fp32_wasmsimd_params, 16, 9);
  SET_QS8_QC8W_DWCONV(2,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16,
    xnn_init_qs8_qc8w_conv_minmax_fp32_wasmsimd_params, 16, 25);
#elif XNN_ARCH_WASM
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (hw->is_x86) {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_imagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_imagic_params, 2, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_imagic_params, 2, 9);
    // Note: the primary-tile-25 slot uses a 1-channel kernel here, unlike slots 0 and 1.
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_imagic_params, 1, 25);
  } else {
    SET_QS8_QC8W_DWCONV(0,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__wasm_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 2, 3);
    SET_QS8_QC8W_DWCONV(1,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 2, 9);
    SET_QS8_QC8W_DWCONV(2,
      xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic,
      xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_fmagic_params, 2, 25);
  }
#elif XNN_ARCH_RISCV
  SET_QS8_QC8W_DWCONV(0,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p2c__scalar_lrintf,
    xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_lrintf_params, 2, 3);
  SET_QS8_QC8W_DWCONV(1,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf,
    xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_lrintf_params, 2, 9);
  SET_QS8_QC8W_DWCONV(2,
    xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf,
    xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_lrintf_params, 2, 25);
#endif
}
#undef SET_QS8_QC8W_DWCONV
// Fills one entry of qs8_dwconv_config with a unipass microkernel.
// In every configuration below channel_subtile equals channel_tile and
// channel_round is 1, so only the tile width and primary tile are passed in.
#define SET_QS8_DWCONV(slot, ukernel, init_fn, tile, primary)                  \
  do {                                                                         \
    qs8_dwconv_config[slot].minmax.unipass =                                   \
        (xnn_dwconv_unipass_ukernel_fn) ukernel;                               \
    qs8_dwconv_config[slot].init.qs8 = init_fn;                                \
    qs8_dwconv_config[slot].channel_tile = tile;                               \
    qs8_dwconv_config[slot].channel_subtile = tile;                            \
    qs8_dwconv_config[slot].channel_round = 1;                                 \
    qs8_dwconv_config[slot].primary_tile = primary;                            \
  } while (0)

// Populates qs8_dwconv_config[0..1] with the unipass microkernels for
// primary tiles 9 and 25. The choice is made by the compile-time
// architecture macro and, on ARM, x86 and WAsm, by flags read from the
// hardware config returned by xnn_init_hardware_config().
static void init_qs8_dwconv_config(void) {
#if XNN_ARCH_ARM
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!hw->use_arm_neon) {
    // No NEON: scalar fp32-requantization kernels, one channel at a time.
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params, 1, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params, 1, 25);
  } else {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params, 16, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64,
      xnn_init_qs8_conv_minmax_rndnu_neon_params, 8, 25);
  }
#elif XNN_ARCH_ARM64
  SET_QS8_DWCONV(0,
    xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64,
    xnn_init_qs8_conv_minmax_rndnu_neon_params, 16, 9);
  SET_QS8_DWCONV(1,
    xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64,
    xnn_init_qs8_conv_minmax_rndnu_neon_params, 16, 25);
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (!XNN_PLATFORM_MOBILE && hw->use_x86_avx512skx) {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx512_params, 32, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx512_params, 32, 25);
  } else if (hw->use_x86_xop) {
    // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 16, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__xop_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 16, 25);
  } else if (hw->use_x86_avx2) {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx2_params, 16, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32,
      xnn_init_qs8_conv_minmax_fp32_avx2_params, 16, 25);
  } else if (hw->use_x86_avx) {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 16, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 16, 25);
  } else if (hw->use_x86_sse4_1) {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 8, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse4_params, 8, 25);
  } else {
    // Fallback when none of the flags above is set: SSE2 kernels.
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params, 8, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16_add16,
      xnn_init_qs8_conv_minmax_fp32_sse2_params, 8, 25);
  }
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  SET_QS8_DWCONV(0,
    xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16,
    xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, 16, 9);
  SET_QS8_DWCONV(1,
    xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16,
    xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, 16, 25);
#elif XNN_ARCH_WASM
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  assert(hw != NULL);
  if (hw->is_x86) {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params, 2, 9);
    // Note: the primary-tile-25 slot uses a 1-channel kernel here, unlike slot 0.
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params, 1, 25);
  } else {
    SET_QS8_DWCONV(0,
      xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params, 2, 9);
    SET_QS8_DWCONV(1,
      xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic,
      xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params, 2, 25);
  }
#elif XNN_ARCH_RISCV
  SET_QS8_DWCONV(0,
    xnn_qs8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf,
    xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params, 2, 9);
  SET_QS8_DWCONV(1,
    xnn_qs8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf,
    xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params, 2, 25);
#endif
}
#undef SET_QS8_DWCONV
// Writes one unipass entry of qu8_dwconv_config.
// Every entry configured by init_qu8_dwconv_config() uses channel_subtile equal
// to channel_tile and channel_round equal to 1, so only the tile and the primary
// tile are taken as arguments.
#define QU8_DWCONV_UNIPASS_ENTRY(index, ukernel, init_fn, tile, taps)                      \
  do {                                                                                      \
    qu8_dwconv_config[(index)].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) (ukernel);  \
    qu8_dwconv_config[(index)].init.qu8 = (init_fn);                                        \
    qu8_dwconv_config[(index)].channel_tile = (tile);                                       \
    qu8_dwconv_config[(index)].channel_subtile = (tile);                                    \
    qu8_dwconv_config[(index)].channel_round = 1;                                           \
    qu8_dwconv_config[(index)].primary_tile = (taps);                                       \
  } while (0)

// Populates the QU8 depthwise-convolution table:
//   qu8_dwconv_config[0] -> unipass microkernel with primary_tile 9
//   qu8_dwconv_config[1] -> unipass microkernel with primary_tile 25
// The microkernel and its params-init function are chosen by the build-time
// architecture macro and, where a branch queries it, by the hardware config
// reported at run time. On an architecture matched by none of the branches the
// table is left as it was.
static void init_qu8_dwconv_config(void) {
  #if XNN_ARCH_ARM
    const struct xnn_hardware_config* hw = xnn_init_hardware_config();
    assert(hw != NULL);
    if (hw->use_arm_neon) {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8,
                               xnn_init_qu8_conv_minmax_rndnu_neon_params, 16, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8,
                               xnn_init_qu8_conv_minmax_rndnu_neon_params, 8, 25);
    } else {
      // No NEON reported: use the scalar kernels.
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p1c__scalar_fmagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params, 1, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_fmagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params, 1, 25);
    }
  #elif XNN_ARCH_ARM64
    QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8,
                             xnn_init_qu8_conv_minmax_rndnu_neon_params, 16, 9);
    QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8,
                             xnn_init_qu8_conv_minmax_rndnu_neon_params, 8, 25);
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
    const struct xnn_hardware_config* hw = xnn_init_hardware_config();
    assert(hw != NULL);
    if (!XNN_PLATFORM_MOBILE && hw->use_x86_avx512skx) {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32,
                               xnn_init_qu8_conv_minmax_fp32_avx512_params, 32, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32,
                               xnn_init_qu8_conv_minmax_fp32_avx512_params, 32, 25);
    } else if (hw->use_x86_xop) {
      // XOP is tested ahead of AVX2 on purpose: AMD Excavator supports both, but
      // performs better with the XOP microkernels.
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul32,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 16, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__xop_mul32,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 16, 25);
    } else if (hw->use_x86_avx2) {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32,
                               xnn_init_qu8_conv_minmax_fp32_avx2_params, 16, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx2_mul32,
                               xnn_init_qu8_conv_minmax_fp32_avx2_params, 16, 25);
    } else if (hw->use_x86_avx) {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 16, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p16c__avx_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 16, 25);
    } else if (hw->use_x86_sse4_1) {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse41_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 8, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse41_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 8, 25);
    } else {
      // None of the feature flags above is set: use the SSE2 kernels.
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__sse2_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 8, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__sse2_mul16,
                               xnn_init_qu8_conv_minmax_fp32_sse2_params, 8, 25);
    }
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
    QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16,
                             xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, 8, 9);
    QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16,
                             xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, 8, 25);
  #elif XNN_ARCH_WASM
    const struct xnn_hardware_config* hw = xnn_init_hardware_config();
    assert(hw != NULL);
    if (hw->is_x86) {
      // Note the differing channel tiles here: 2 for the 9-tap entry, 1 for the 25-tap entry.
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_imagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params, 2, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p1c__scalar_imagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params, 1, 25);
    } else {
      QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__wasm_fmagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params, 2, 9);
      QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__wasm_fmagic,
                               xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params, 2, 25);
    }
  #elif XNN_ARCH_RISCV
    QU8_DWCONV_UNIPASS_ENTRY(0, xnn_qu8_dwconv_minmax_fp32_ukernel_9p2c__scalar_lrintf,
                             xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params, 2, 9);
    QU8_DWCONV_UNIPASS_ENTRY(1, xnn_qu8_dwconv_minmax_fp32_ukernel_25p2c__scalar_lrintf,
                             xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params, 2, 25);
  #endif
}

#undef QU8_DWCONV_UNIPASS_ENTRY
#if XNN_PLATFORM_WINDOWS
// Callback adapter handed to InitOnceExecuteOnce for the f16 table: the three
// callback arguments are not used; it runs init_f16_dwconv_config() and
// always returns TRUE.
static BOOL CALLBACK init_f16_dwconv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  (void) init_once;
  (void) parameter;
  (void) context;
  init_f16_dwconv_config();
  return TRUE;
}
// Callback adapter handed to InitOnceExecuteOnce for the f32 table: the three
// callback arguments are not used; it runs init_f32_dwconv_config() and
// always returns TRUE.
static BOOL CALLBACK init_f32_dwconv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  (void) init_once;
  (void) parameter;
  (void) context;
  init_f32_dwconv_config();
  return TRUE;
}
// Callback adapter handed to InitOnceExecuteOnce for the qs8-qc8w table: the
// three callback arguments are not used; it runs
// init_qs8_qc8w_dwconv_config() and always returns TRUE.
static BOOL CALLBACK init_qs8_qc8w_dwconv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  (void) init_once;
  (void) parameter;
  (void) context;
  init_qs8_qc8w_dwconv_config();
  return TRUE;
}
// Callback adapter handed to InitOnceExecuteOnce for the qs8 table: the three
// callback arguments are not used; it runs init_qs8_dwconv_config() and
// always returns TRUE.
static BOOL CALLBACK init_qs8_dwconv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  (void) init_once;
  (void) parameter;
  (void) context;
  init_qs8_dwconv_config();
  return TRUE;
}
// Callback adapter handed to InitOnceExecuteOnce for the qu8 table: the three
// callback arguments are not used; it runs init_qu8_dwconv_config() and
// always returns TRUE.
static BOOL CALLBACK init_qu8_dwconv_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  (void) init_once;
  (void) parameter;
  (void) context;
  init_qu8_dwconv_config();
  return TRUE;
}
#endif
// Returns the f16 depthwise-convolution config table.
// Returns NULL when no hardware config is available or when
// xnn_is_f16_compatible_config() rejects it. Otherwise the table is populated
// through a run-once guard (InitOnceExecuteOnce on Windows, pthread_once
// elsewhere) before the pointer to its first entry is returned.
struct xnn_dwconv_config* xnn_init_f16_dwconv_config() {
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  if (hw == NULL) {
    return NULL;
  }
  if (!xnn_is_f16_compatible_config(hw)) {
    return NULL;
  }
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard_f16_dwconv, init_f16_dwconv_config_windows, NULL, NULL);
  #else
    pthread_once(&init_guard_f16_dwconv, init_f16_dwconv_config);
  #endif
  return &f16_dwconv_config[0];
}
// Returns the f32 depthwise-convolution config table.
// Returns NULL when no hardware config is available. Otherwise the table is
// populated through a run-once guard (InitOnceExecuteOnce on Windows,
// pthread_once elsewhere) before the pointer to its first entry is returned.
struct xnn_dwconv_config* xnn_init_f32_dwconv_config() {
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  if (hw == NULL) {
    return NULL;
  }
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard_f32_dwconv, init_f32_dwconv_config_windows, NULL, NULL);
  #else
    pthread_once(&init_guard_f32_dwconv, init_f32_dwconv_config);
  #endif
  return &f32_dwconv_config[0];
}
// Returns the qs8-qc8w depthwise-convolution config table.
// Returns NULL when no hardware config is available. Otherwise the table is
// populated through a run-once guard (InitOnceExecuteOnce on Windows,
// pthread_once elsewhere) before the pointer to its first entry is returned.
struct xnn_dwconv_config* xnn_init_qs8_qc8w_dwconv_config() {
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  if (hw == NULL) {
    return NULL;
  }
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard_qs8_qc8w_dwconv, init_qs8_qc8w_dwconv_config_windows, NULL, NULL);
  #else
    pthread_once(&init_guard_qs8_qc8w_dwconv, init_qs8_qc8w_dwconv_config);
  #endif
  return &qs8_qc8w_dwconv_config[0];
}
// Returns the qs8 depthwise-convolution config table.
// Returns NULL when no hardware config is available. Otherwise the table is
// populated through a run-once guard (InitOnceExecuteOnce on Windows,
// pthread_once elsewhere) before the pointer to its first entry is returned.
struct xnn_dwconv_config* xnn_init_qs8_dwconv_config() {
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  if (hw == NULL) {
    return NULL;
  }
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard_qs8_dwconv, init_qs8_dwconv_config_windows, NULL, NULL);
  #else
    pthread_once(&init_guard_qs8_dwconv, init_qs8_dwconv_config);
  #endif
  return &qs8_dwconv_config[0];
}
// Returns the qu8 depthwise-convolution config table.
// Returns NULL when no hardware config is available. Otherwise the table is
// populated through a run-once guard (InitOnceExecuteOnce on Windows,
// pthread_once elsewhere) before the pointer to its first entry is returned.
struct xnn_dwconv_config* xnn_init_qu8_dwconv_config() {
  const struct xnn_hardware_config* hw = xnn_init_hardware_config();
  if (hw == NULL) {
    return NULL;
  }
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard_qu8_dwconv, init_qu8_dwconv_config_windows, NULL, NULL);
  #else
    pthread_once(&init_guard_qu8_dwconv, init_qu8_dwconv_config);
  #endif
  return &qu8_dwconv_config[0];
}