// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <arm_neon.h>

// The original include list did not survive in this copy; the headers below
// are the subset the kernels in this file require (reconstructed from usage,
// so treat the exact set as an assumption).
#include <xnnpack/argmaxpool.h>
#include <xnnpack/avgpool.h>
#include <xnnpack/common.h>
#include <xnnpack/conv.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>
#include <xnnpack/vcvt.h>


void xnn_f16_f32_vcvt_ukernel__neon_int16_x16(
    size_t batch,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const uint16x8_t vsign_mask = vmovq_n_u16(0x8000);
  const uint16x8_t vexp_offset = vmovq_n_u16(0x7000);
  const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale);
  const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000);
  const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400);

  const uint16_t* i = (const uint16_t*) input;
  for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
    const uint16x8_t vh0 = vld1q_u16(i); i += 8;
    const uint16x8_t vh1 = vld1q_u16(i); i += 8;

    const uint16x8_t vsign0 = vandq_u16(vh0, vsign_mask);
    const uint16x8_t vsign1 = vandq_u16(vh1, vsign_mask);

    const uint16x8_t vnonsign0 = veorq_u16(vh0, vsign0);
    const uint16x8_t vnonsign1 = veorq_u16(vh1, vsign1);

    const uint16x8x2_t vprenorm0 = vzipq_u16(vshlq_n_u16(vnonsign0, 13), vsraq_n_u16(vexp_offset, vnonsign0, 3));
    const uint16x8x2_t vprenorm1 = vzipq_u16(vshlq_n_u16(vnonsign1, 13), vsraq_n_u16(vexp_offset, vnonsign1, 3));

    const float32x4_t vnorm0 = vmulq_f32(vreinterpretq_f32_u16(vprenorm0.val[0]), vexp_scale);
    const float32x4_t vnorm1 = vmulq_f32(vreinterpretq_f32_u16(vprenorm0.val[1]), vexp_scale);
    const float32x4_t vnorm2 = vmulq_f32(vreinterpretq_f32_u16(vprenorm1.val[0]), vexp_scale);
    const float32x4_t vnorm3 = vmulq_f32(vreinterpretq_f32_u16(vprenorm1.val[1]), vexp_scale);

    const float32x4_t vdenorm0 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign0))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm1 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign0))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm2 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign1))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm3 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign1))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask0 = vcgtq_u16(vnonsign0, vdenorm_cutoff);
    const uint16x8_t vmask1 = vcgtq_u16(vnonsign1, vdenorm_cutoff);

    const uint32x4_t vxmask0 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask0))));
    const uint32x4_t vf0 = vorrq_u32(vshll_n_u16(vget_low_u16(vsign0), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask0, vnorm0, vdenorm0)));
    const uint32x4_t vxmask2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask1))));
    const uint32x4_t vf2 = vorrq_u32(vshll_n_u16(vget_low_u16(vsign1), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask2, vnorm2, vdenorm2)));
    const uint32x4_t vxmask1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask0))));
    const uint32x4_t vf1 = vorrq_u32(vshll_n_u16(vget_high_u16(vsign0), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask1, vnorm1, vdenorm1)));
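    // Note on the vxmask*/vf* pattern above and below: vmovl_s16 sign-extends
    // each 16-bit comparison lane to 32 bits (0xFFFF -> 0xFFFFFFFF), so the
    // vbslq_f32 can select the normalized result where the input exceeds the
    // denormal cutoff and the magic-bias result elsewhere; the half's sign is
    // then re-attached with a widening shift and OR.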
    const uint32x4_t vxmask3 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask1))));
    const uint32x4_t vf3 = vorrq_u32(vshll_n_u16(vget_high_u16(vsign1), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask3, vnorm3, vdenorm3)));

    vst1q_f32(output, vreinterpretq_f32_u32(vf0)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf1)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf2)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf3)); output += 4;
  }
  for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
    const uint16x8_t vh = vld1q_u16(i); i += 8;

    const uint16x8_t vsign = vandq_u16(vh, vsign_mask);
    const uint16x8_t vnonsign = veorq_u16(vh, vsign);

    const uint16x8x2_t vprenorm = vzipq_u16(vshlq_n_u16(vnonsign, 13), vsraq_n_u16(vexp_offset, vnonsign, 3));
    const float32x4_t vnorm_lo = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[0]), vexp_scale);
    const float32x4_t vnorm_hi = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[1]), vexp_scale);

    const float32x4_t vdenorm_lo = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm_hi = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask = vcgtq_u16(vnonsign, vdenorm_cutoff);

    const uint32x4_t vxmask_lo = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask))));
    const uint32x4_t vf_lo = vorrq_u32(vshll_n_u16(vget_low_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_lo, vnorm_lo, vdenorm_lo)));
    const uint32x4_t vxmask_hi = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask))));
    const uint32x4_t vf_hi = vorrq_u32(vshll_n_u16(vget_high_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_hi, vnorm_hi, vdenorm_hi)));

    vst1q_f32(output, vreinterpretq_f32_u32(vf_lo)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf_hi)); output += 4;
  }
  if XNN_UNPREDICTABLE(batch != 0) {
    const uint16x8_t vh = vld1q_u16(i); i += 8;

    const uint16x8_t vsign = vandq_u16(vh, vsign_mask);
    const uint16x8_t vnonsign = veorq_u16(vh, vsign);

    const uint16x8x2_t vprenorm = vzipq_u16(vshlq_n_u16(vnonsign, 13), vsraq_n_u16(vexp_offset, vnonsign, 3));
    const float32x4_t vnorm_lo = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[0]), vexp_scale);
    const float32x4_t vnorm_hi = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[1]), vexp_scale);

    const float32x4_t vdenorm_lo = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm_hi = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask = vcgtq_u16(vnonsign, vdenorm_cutoff);

    const uint32x4_t vxmask_lo = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask))));
    uint32x4_t vf = vorrq_u32(vshll_n_u16(vget_low_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_lo, vnorm_lo, vdenorm_lo)));

    if (batch & (4 * sizeof(uint16_t))) {
      vst1q_f32(output, vreinterpretq_f32_u32(vf)); output += 4;

      const uint32x4_t vxmask_hi = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask))));
      vf = vorrq_u32(vshll_n_u16(vget_high_u16(vsign), 16),
        vreinterpretq_u32_f32(vbslq_f32(vxmask_hi, vnorm_hi, vdenorm_hi)));
    }
    uint32x2_t vf_lo = vget_low_u32(vf);
    if (batch & (2 * sizeof(uint16_t))) {
      vst1_f32(output, vreinterpret_f32_u32(vf_lo)); output += 2;
      vf_lo = vget_high_u32(vf);
    }
    if (batch & (1 * sizeof(uint16_t))) {
      vst1_lane_f32(output, vreinterpret_f32_u32(vf_lo), 0);
    }
  }
}
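
// Illustrative scalar model of the conversion performed by the kernel above.
// This helper is not part of XNNPACK; it is a minimal sketch assuming IEEE
// binary16 input and exp_scale == 0x1.0p-112f (the value XNNPACK's default
// parameter initialization uses for this kernel family).
static inline float f16_to_f32_scalar_sketch(uint16_t h) {
  union { uint32_t u; float f; } bits, word, magic;
  const uint32_t sign = (uint32_t) (h & UINT16_C(0x8000)) << 16;
  const uint16_t nonsign = h & UINT16_C(0x7FFF);
  if (nonsign > UINT16_C(0x0400)) {
    // Normal, infinite, or NaN halves: shift the exponent/mantissa into fp32
    // position (this is the 32-bit word the vzipq_u16 of the shifted halves
    // assembles), then rebias via the 0x7000 offset plus a 2**-112 rescale.
    word.u = ((uint32_t) nonsign << 13) + UINT32_C(0x70000000);
    bits.f = word.f * 0x1.0p-112f;
  } else {
    // Zero and denormal halves (the vcgtq_u16 path being false): integer-add
    // the mantissa to the bit pattern of 0.5f, then subtract 0.5f, which
    // leaves exactly mantissa * 2**-24.
    magic.u = UINT32_C(0x3F000000);
    word.u = magic.u + (uint32_t) nonsign;
    bits.f = word.f - magic.f;
  }
  bits.u |= sign;
  return bits.f;
}
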
void xnn_f32_argmaxpool_ukernel_4x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 4);
  assert(channels != 0);

  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements < 4) {
      i3 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      vst1q_f32(output, vmax); output += 4;
      vst1q_u32(index, vidx); index += 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      float32x2_t vmax_lo = vget_low_f32(vmax);
      uint32x2_t vidx_lo = vget_low_u32(vidx);
      if (c & 2) {
        vst1_f32(output, vmax_lo); output += 2;
        vst1_u32(index, vidx_lo); index += 2;
        vmax_lo = vget_high_f32(vmax);
        vidx_lo = vget_high_u32(vidx);
      }
      if (c & 1) {
        vst1_lane_f32(output, vmax_lo, 0); output += 1;
        vst1_lane_u32(index, vidx_lo, 0); index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
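
// The compare/select sequence used by every argmax kernel in this file is the
// vector form of the scalar update
//   if (vi > vmax) { vmax = vi; vidx = k; }
// vcgtq_f32 yields an all-ones lane mask where the candidate is strictly
// greater, and the vbslq_f32/vbslq_u32 pair routes the new maximum and its
// pooling index through that mask. Because the comparison is strict, ties
// keep the index of the earliest pooling element.
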
void xnn_f32_argmaxpool_ukernel_9p8x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* accumulation_buffer,
    uint32_t* index_buffer,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements > 9);
  assert(channels != 0);

  do {
    {
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      const float* i8 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      i8 = (const float*) ((uintptr_t) i8 + input_offset);

      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

        float32x4_t vmax = vi0;
        uint32x4_t vidx = vmovq_n_u32(0);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

        const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
        vmax = vbslq_f32(vm8, vi8, vmax);
        vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

        vst1q_f32(ab, vmax); ab += 4;
        vst1q_u32(ib, vidx); ib += 4;
      }
    }
    const uint32x4_t v1 = vmovq_n_u32(1);
    const uint32x4_t v8 = vmovq_n_u32(8);
    uint32x4_t vidx0 = vaddq_u32(v1, v8);

    size_t k = pooling_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);

      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;

        float32x4_t vmax = vld1q_f32(ab);
        uint32x4_t vidx = vld1q_u32(ib);

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        vst1q_f32(ab, vmax); ab += 4;
        vst1q_u32(ib, vidx); ib += 4;
      }
      vidx0 = vaddq_u32(vidx0, v8);
    }

    float* o = output;
    uint32_t* i = index;
    {
      const float* i0 = input[0];
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = i0;
      }
      if (k <= 2) {
        i2 = i0;
      }
      if (k < 4) {
        i3 = i0;
      }
      if (k <= 4) {
        i4 = i0;
      }
      if (k < 6) {
        i5 = i0;
      }
      if (k <= 6) {
        i6 = i0;
      }
      if (k < 8) {
        i7 = i0;
      }

      size_t c = channels;
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;
      for (; c >= 4; c -= 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;

        float32x4_t vmax = vld1q_f32(ab); ab += 4;
        uint32x4_t vidx = vld1q_u32(ib); ib += 4;

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);
        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        vst1q_f32(o, vmax); o += 4;
        vst1q_u32(i, vidx); i += 4;
      }
      if (c != 0) {
        const float32x4_t vi0 = vld1q_f32(i0);
        const float32x4_t vi1 = vld1q_f32(i1);
        const float32x4_t vi2 = vld1q_f32(i2);
        const float32x4_t vi3 = vld1q_f32(i3);
        const float32x4_t vi4 = vld1q_f32(i4);
        const float32x4_t vi5 = vld1q_f32(i5);
        const float32x4_t vi6 = vld1q_f32(i6);
        const float32x4_t vi7 = vld1q_f32(i7);

        float32x4_t vmax = vld1q_f32(ab);
        uint32x4_t vidx = vld1q_u32(ib);

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        float32x2_t vmax_lo = vget_low_f32(vmax);
        uint32x2_t vidx_lo = vget_low_u32(vidx);
        if (c & 2) {
          vst1_f32(o, vmax_lo); o += 2;
          vst1_u32(i, vidx_lo); i += 2;
          vmax_lo = vget_high_f32(vmax);
          vidx_lo = vget_high_u32(vidx);
        }
        if (c & 1) {
          vst1_lane_f32(o, vmax_lo, 0); o += 1;
          vst1_lane_u32(i, vidx_lo, 0); i += 1;
        }
      }
    }

    output = (float*) ((uintptr_t) o + output_increment);
    index = (uint32_t*) i;
  } while (--output_pixels != 0);
}
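
// Multipass layout of the 9p8x kernel above: the first pass reduces pooling
// elements 0-8 into accumulation_buffer/index_buffer, each middle pass folds
// in 8 more elements (vidx0 starts at 9 and advances by 8 per pass), and the
// final pass combines the last 1-8 elements with the buffered running maxima
// while writing the results to output/index. Out-of-range rows in the final
// pass alias i0, which is harmless because i0's value can no longer win the
// strict comparison once it has been folded in.
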
void xnn_f32_argmaxpool_ukernel_9x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 9);
  assert(channels != 0);

  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    i4 = (const float*) ((uintptr_t) i4 + input_offset);
    i5 = (const float*) ((uintptr_t) i5 + input_offset);
    i6 = (const float*) ((uintptr_t) i6 + input_offset);
    i7 = (const float*) ((uintptr_t) i7 + input_offset);
    i8 = (const float*) ((uintptr_t) i8 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements < 4) {
      i3 = i0;
    }
    if (pooling_elements <= 4) {
      i4 = i0;
    }
    if (pooling_elements < 6) {
      i5 = i0;
    }
    if (pooling_elements <= 6) {
      i6 = i0;
    }
    if (pooling_elements < 8) {
      i7 = i0;
    }
    if (pooling_elements <= 8) {
      i8 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
      const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
      vmax = vbslq_f32(vm4, vi4, vmax);
      vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

      const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
      vmax = vbslq_f32(vm5, vi5, vmax);
      vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

      const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
      vmax = vbslq_f32(vm6, vi6, vmax);
      vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

      const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
      vmax = vbslq_f32(vm7, vi7, vmax);
      vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

      const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
      vmax = vbslq_f32(vm8, vi8, vmax);
      vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

      vst1q_f32(output, vmax); output += 4;
      vst1q_u32(index, vidx); index += 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);
      const float32x4_t vi4 = vld1q_f32(i4);
      const float32x4_t vi5 = vld1q_f32(i5);
      const float32x4_t vi6 = vld1q_f32(i6);
      const float32x4_t vi7 = vld1q_f32(i7);
      const float32x4_t vi8 = vld1q_f32(i8);

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
      vmax = vbslq_f32(vm4, vi4, vmax);
      vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

      const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
      vmax = vbslq_f32(vm5, vi5, vmax);
      vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

      const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
      vmax = vbslq_f32(vm6, vi6, vmax);
      vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

      const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
      vmax = vbslq_f32(vm7, vi7, vmax);
      vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

      const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
      vmax = vbslq_f32(vm8, vi8, vmax);
      vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

      float32x2_t vmax_lo = vget_low_f32(vmax);
      uint32x2_t vidx_lo = vget_low_u32(vidx);
      if (c & 2) {
        vst1_f32(output, vmax_lo); output += 2;
        vst1_u32(index, vidx_lo); index += 2;
        vmax_lo = vget_high_f32(vmax);
        vidx_lo = vget_high_u32(vidx);
      }
      if (c & 1) {
        vst1_lane_f32(output, vmax_lo, 0); output += 1;
        vst1_lane_u32(index, vidx_lo, 0); index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
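
// Hypothetical scalar reference for one output pixel of the argmax pooling
// kernels (not an XNNPACK API; a sketch for checking the vector logic, with
// `in` holding `pooling_elements` row pointers already offset for this pixel).
static inline void argmaxpool_scalar_sketch(
    size_t pooling_elements,
    size_t channels,
    const float** in,
    float* out,
    uint32_t* index)
{
  for (size_t c = 0; c < channels; c++) {
    float vmax = in[0][c];
    uint32_t vidx = 0;
    for (size_t k = 1; k < pooling_elements; k++) {
      // Strict comparison: ties keep the earliest index, as in the kernels.
      if (in[k][c] > vmax) {
        vmax = in[k][c];
        vidx = (uint32_t) k;
      }
    }
    out[c] = vmax;
    index[c] = vidx;
  }
}
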
void xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* buffer,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  do {
    {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }
      const float* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

        vst1q_f32(b, vsum); b += 4;
      }
    }

    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vacc = vld1q_f32(b);

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        vst1q_f32(b, vsum); b += 4;
      }
    }

    assert(k >= 1);
    {
      const float* i0 = input[0];
      assert(i0 != NULL);
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      size_t c = channels;
      float* b = buffer;
      while (c >= 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vacc = vld1q_f32(b); b += 4;
        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        float32x4_t vout = vmulq_f32(vsum, vscale);
        vout = vmaxq_f32(vout, vmin);
        vout = vminq_f32(vout, vmax);

        vst1q_f32(output, vout); output += 4;

        c -= 4;
      }
      if (c != 0) {
        const float32x4_t vi0 = vld1q_f32(i0);
        const float32x4_t vi1 = vld1q_f32(i1);
        const float32x4_t vi2 = vld1q_f32(i2);
        const float32x4_t vi3 = vld1q_f32(i3);
        const float32x4_t vi4 = vld1q_f32(i4);
        const float32x4_t vi5 = vld1q_f32(i5);
        const float32x4_t vi6 = vld1q_f32(i6);
        const float32x4_t vi7 = vld1q_f32(i7);
        const float32x4_t vacc = vld1q_f32(b);

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        float32x4_t vout = vmulq_f32(vsum, vscale);
        vout = vmaxq_f32(vout, vmin);
        vout = vminq_f32(vout, vmax);

        float32x2_t vout_lo = vget_low_f32(vout);
        if (c & 2) {
          vst1_f32(output, vout_lo); output += 2;
          vout_lo = vget_high_f32(vout);
        }
        if (c & 1) {
          vst1_lane_f32(output, vout_lo, 0); output += 1;
        }
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
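
// The 9p8x average-pooling kernel accumulates raw sums in `buffer` across
// passes and applies vscale together with the vmin/vmax clamp only once, in
// the final pass. Assuming the caller sets params->scalar.scale to
// 1.0f / kernel_elements (as XNNPACK's operator setup does), the stored
// result is the clamped mean of the pooling window.
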
void xnn_f32_avgpool_minmax_ukernel_9x__neon_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  do {
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    input = (const float**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) {
      i1 = zero;
    }
    assert(i1 != NULL);
    if (kernel_elements <= 2) {
      i2 = zero;
    }
    assert(i2 != NULL);
    if (kernel_elements < 4) {
      i3 = zero;
    }
    assert(i3 != NULL);
    if (kernel_elements <= 4) {
      i4 = zero;
    }
    assert(i4 != NULL);
    if (kernel_elements < 6) {
      i5 = zero;
    }
    assert(i5 != NULL);
    if (kernel_elements <= 6) {
      i6 = zero;
    }
    assert(i6 != NULL);
    if (kernel_elements < 8) {
      i7 = zero;
    }
    assert(i7 != NULL);
    if (kernel_elements <= 8) {
      i8 = zero;
    }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
      const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
      const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
      const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
      const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
      const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

      float32x4_t vout = vmulq_f32(vsum, vscale);
      vout = vmaxq_f32(vout, vmin);
      vout = vminq_f32(vout, vmax);

      vst1q_f32(output, vout); output += 4;

      c -= 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);
      const float32x4_t vi4 = vld1q_f32(i4);
      const float32x4_t vi5 = vld1q_f32(i5);
      const float32x4_t vi6 = vld1q_f32(i6);
      const float32x4_t vi7 = vld1q_f32(i7);
      const float32x4_t vi8 = vld1q_f32(i8);

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
      const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
      const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
      const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
      const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

      float32x4_t vout = vmulq_f32(vsum, vscale);
      vout = vmaxq_f32(vout, vmin);
      vout = vminq_f32(vout, vmax);

      float32x2_t vout_lo = vget_low_f32(vout);
      if (c & 2) {
        vst1_f32(output, vout_lo); output += 2;
        vout_lo = vget_high_f32(vout);
      }
      if (c & 1) {
        vst1_lane_f32(output, vout_lo, 0); output += 1;
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
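
// Minimal scalar sketch of one output element of the average-pooling kernels
// (illustrative only; `scale`, `min`, and `max` mirror the params fields).
// Out-of-range taps contribute zeros, which is why the vector kernels alias
// unused row pointers to the shared `zero` buffer instead of cloning i0.
static inline float avgpool_minmax_scalar_sketch(
    const float* v,
    size_t kernel_elements,
    float scale,
    float min,
    float max)
{
  float sum = 0.0f;
  for (size_t k = 0; k < kernel_elements; k++) {
    sum += v[k];
  }
  float out = sum * scale;
  out = out < min ? min : out;  // vmaxq_f32(vout, vmin)
  out = out > max ? max : out;  // vminq_f32(vout, vmax)
  return out;
}
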
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_channel_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      output1 = output0;
    }

    const float* w = weights;
    size_t c = output_channels;
    float* o0c0 = output0;
    float* o1c0 = output1;
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
    float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
    float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
    float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
    do {
      if XNN_UNPREDICTABLE(c < 2) {
        o0c1 = o0c0;
        o1c1 = o1c0;
      }
      if XNN_UNPREDICTABLE(c <= 2) {
        o0c2 = o0c1;
        o1c2 = o1c1;
      }
      if XNN_UNPREDICTABLE(c < 4) {
        o0c3 = o0c2;
        o1c3 = o1c2;
      }

      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      float32x4_t vi0x0 = vmovq_n_f32(0.0f);
      float32x4_t vi1x0 = vmovq_n_f32(0.0f);
      float32x4_t vi2x0 = vmovq_n_f32(0.0f);
      float32x4_t vi3x0 = vmovq_n_f32(0.0f);
      float32x4_t vi4x0 = vmovq_n_f32(0.0f);

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
        float32x4_t vo0x0 = vld1q_f32(w);
        float32x4_t vo1x0 = vo0x0;
        float32x4_t vo0x1 = vo0x0;
        float32x4_t vo1x1 = vo0x0;

        const float32x4_t vk00c0 = vld1q_f32(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c0, vget_low_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c0, vget_low_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c0, vget_high_f32(vi0x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c0, vget_high_f32(vi2x1), 1);

        const float32x4_t vk10c0 = vld1q_f32(w + 8);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c0, vget_low_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c0, vget_low_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c0, vget_high_f32(vi1x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c0, vget_high_f32(vi3x1), 1);

        const float32x4_t vk20c0 = vld1q_f32(w + 12);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c0, vget_low_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c0, vget_low_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c0, vget_high_f32(vi2x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c0, vget_high_f32(vi4x1), 1);

        const float32x4_t vk00c1 = vld1q_f32(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c1, vget_high_f32(vi0x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c1, vget_high_f32(vi2x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c1, vget_low_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c1, vget_low_f32(vi2x2), 0);

        const float32x4_t vk10c1 = vld1q_f32(w + 20);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0);

        const float32x4_t vk20c1 = vld1q_f32(w + 24);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c1, vget_high_f32(vi2x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c1, vget_high_f32(vi4x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c1, vget_low_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c1, vget_low_f32(vi4x2), 0);

        const float32x4_t vk00c2 = vld1q_f32(w + 28);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c2, vget_high_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c2, vget_high_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c2, vget_low_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c2, vget_low_f32(vi2x2), 1);

        const float32x4_t vk10c2 = vld1q_f32(w + 32);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c2, vget_high_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c2, vget_high_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c2, vget_low_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c2, vget_low_f32(vi3x2), 1);

        const float32x4_t vk20c2 = vld1q_f32(w + 36);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c2, vget_high_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c2, vget_high_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c2, vget_low_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c2, vget_low_f32(vi4x2), 1);

        const float32x4_t vk01c0 = vld1q_f32(w + 40);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c0, vget_low_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c0, vget_low_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c0, vget_high_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c0, vget_high_f32(vi2x2), 0);

        const float32x4_t vk11c0 = vld1q_f32(w + 44);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c0, vget_low_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c0, vget_low_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c0, vget_high_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c0, vget_high_f32(vi3x2), 0);

        const float32x4_t vk21c0 = vld1q_f32(w + 48);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c0, vget_low_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c0, vget_low_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c0, vget_high_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c0, vget_high_f32(vi4x2), 0);

        const float32x4_t vk01c1 = vld1q_f32(w + 52);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c1, vget_low_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c1, vget_low_f32(vi2x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c1, vget_high_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c1, vget_high_f32(vi2x2), 1);

        const float32x4_t vk11c1 = vld1q_f32(w + 56);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c1, vget_low_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c1, vget_low_f32(vi3x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c1, vget_high_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c1, vget_high_f32(vi3x2), 1);

        const float32x4_t vk21c1 = vld1q_f32(w + 60);
        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1);

        const float32x4_t vk01c2 = vld1q_f32(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c2, vget_high_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c2, vget_high_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c2, vget_low_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c2, vget_low_f32(vi2x3), 0);

        const float32x4_t vk11c2 = vld1q_f32(w + 68);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c2, vget_high_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c2, vget_high_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c2, vget_low_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c2, vget_low_f32(vi3x3), 0);

        const float32x4_t vk21c2 = vld1q_f32(w + 72);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c2, vget_high_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c2, vget_high_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c2, vget_low_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);

        const float32x4_t vk02c0 = vld1q_f32(w + 76);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c0, vget_high_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c0, vget_high_f32(vi2x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c0, vget_low_f32(vi0x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c0, vget_low_f32(vi2x3), 1);

        const float32x4_t vk12c0 = vld1q_f32(w + 80);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c0, vget_high_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c0, vget_high_f32(vi3x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c0, vget_low_f32(vi1x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c0, vget_low_f32(vi3x3), 1);

        const float32x4_t vk22c0 = vld1q_f32(w + 84);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c0, vget_high_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c0, vget_high_f32(vi4x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c0, vget_low_f32(vi2x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c0, vget_low_f32(vi4x3), 1);

        const float32x4_t vk02c1 = vld1q_f32(w + 88);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c1, vget_low_f32(vi0x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c1, vget_low_f32(vi2x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c1, vget_high_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c1, vget_high_f32(vi2x3), 0);

        const float32x4_t vk12c1 = vld1q_f32(w + 92);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c1, vget_low_f32(vi1x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c1, vget_low_f32(vi3x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c1, vget_high_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c1, vget_high_f32(vi3x3), 0);

        const float32x4_t vk22c1 = vld1q_f32(w + 96);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c1, vget_low_f32(vi2x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c1, vget_low_f32(vi4x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c1, vget_high_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c1, vget_high_f32(vi4x3), 0);

        const float32x4_t vk02c2 = vld1q_f32(w + 100);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c2, vget_low_f32(vi0x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c2, vget_low_f32(vi2x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c2, vget_high_f32(vi0x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c2, vget_high_f32(vi2x3), 1);
        const float32x4_t vk12c2 = vld1q_f32(w + 104);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c2, vget_low_f32(vi1x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c2, vget_low_f32(vi3x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c2, vget_high_f32(vi1x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c2, vget_high_f32(vi3x3), 1);

        const float32x4_t vk22c2 = vld1q_f32(w + 108);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c2, vget_low_f32(vi2x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c2, vget_low_f32(vi4x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c2, vget_high_f32(vi2x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c2, vget_high_f32(vi4x3), 1);

        vi0x0 = vi0x3;
        vi1x0 = vi1x3;
        vi2x0 = vi2x3;
        vi3x0 = vi3x3;
        vi4x0 = vi4x3;

        vo0x0 = vmaxq_f32(vo0x0, vmin);
        vo1x0 = vmaxq_f32(vo1x0, vmin);
        vo0x1 = vmaxq_f32(vo0x1, vmin);
        vo1x1 = vmaxq_f32(vo1x1, vmin);

        vo0x0 = vminq_f32(vo0x0, vmax);
        vo1x0 = vminq_f32(vo1x0, vmax);
        vo0x1 = vminq_f32(vo0x1, vmax);
        vo1x1 = vminq_f32(vo1x1, vmax);

        const float32x4x2_t vo0c0123 = vzipq_f32(vo0x0, vo0x1);
        const float32x4x2_t vo1c0123 = vzipq_f32(vo1x0, vo1x1);

        // Always 2+ output width elements remaining
        vst1_f32(o1c0, vget_low_f32(vo1c0123.val[0])); o1c0 += 2;
        vst1_f32(o1c1, vget_high_f32(vo1c0123.val[0])); o1c1 += 2;
        vst1_f32(o1c2, vget_low_f32(vo1c0123.val[1])); o1c2 += 2;
        vst1_f32(o1c3, vget_high_f32(vo1c0123.val[1])); o1c3 += 2;

        vst1_f32(o0c0, vget_low_f32(vo0c0123.val[0])); o0c0 += 2;
        vst1_f32(o0c1, vget_high_f32(vo0c0123.val[0])); o0c1 += 2;
        vst1_f32(o0c2, vget_low_f32(vo0c0123.val[1])); o0c2 += 2;
        vst1_f32(o0c3, vget_high_f32(vo0c0123.val[1])); o0c3 += 2;
      }
      assert(iw < 4);
      if XNN_UNLIKELY(iw != 0) {
        float32x4_t vo0x0 = vld1q_f32(w);
        float32x4_t vo1x0 = vo0x0;
        float32x4_t vo0x1 = vo0x0;
        float32x4_t vo1x1 = vo0x0;

        const float32x4_t vk00c0 = vld1q_f32(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        float32x4_t vi0x1 = vld1q_f32(i0);
        float32x4_t vi1x1 = vld1q_f32(i1);
        float32x4_t vi2x1 = vld1q_f32(i2);
        float32x4_t vi3x1 = vld1q_f32(i3);
        float32x4_t vi4x1 = vld1q_f32(i4);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c0, vget_low_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c0, vget_low_f32(vi2x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk00c0, vget_high_f32(vi0x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk00c0, vget_high_f32(vi2x1), 1);
        }

        const float32x4_t vk10c0 = vld1q_f32(w + 8);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c0, vget_low_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c0, vget_low_f32(vi3x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk10c0, vget_high_f32(vi1x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk10c0, vget_high_f32(vi3x1), 1);
        }

        const float32x4_t vk20c0 = vld1q_f32(w + 12);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c0, vget_low_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c0, vget_low_f32(vi4x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk20c0, vget_high_f32(vi2x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk20c0, vget_high_f32(vi4x1), 1);
        }

        const float32x4_t vk00c1 = vld1q_f32(w + 16);

        float32x4_t vi0x2 = vmovq_n_f32(0.0f);
        float32x4_t vi1x2 = vmovq_n_f32(0.0f);
        float32x4_t vi2x2 = vmovq_n_f32(0.0f);
        float32x4_t vi3x2 = vmovq_n_f32(0.0f);
        float32x4_t vi4x2 = vmovq_n_f32(0.0f);
        if (iw >= 2) {
          // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
          vi0x2 = vld1q_f32(i0 + 4);
          vi1x2 = vld1q_f32(i1 + 4);
          vi2x2 = vld1q_f32(i2 + 4);
          vi3x2 = vld1q_f32(i3 + 4);
          vi4x2 = vld1q_f32(i4 + 4);
        }

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c1, vget_high_f32(vi0x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c1, vget_high_f32(vi2x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c1, vget_low_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c1, vget_low_f32(vi2x2), 0);

        const float32x4_t vk10c1 = vld1q_f32(w + 20);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0);

        const float32x4_t vk20c1 = vld1q_f32(w + 24);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c1, vget_high_f32(vi2x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c1, vget_high_f32(vi4x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c1, vget_low_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c1, vget_low_f32(vi4x2), 0);

        const float32x4_t vk00c2 = vld1q_f32(w + 28);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c2, vget_high_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c2, vget_high_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c2, vget_low_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c2, vget_low_f32(vi2x2), 1);

        const float32x4_t vk10c2 = vld1q_f32(w + 32);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c2, vget_high_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c2, vget_high_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c2, vget_low_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c2, vget_low_f32(vi3x2), 1);

        const float32x4_t vk20c2 = vld1q_f32(w + 36);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c2, vget_high_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c2, vget_high_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c2, vget_low_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c2, vget_low_f32(vi4x2), 1);

        const float32x4_t vk01c0 = vld1q_f32(w + 40);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c0, vget_low_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c0, vget_low_f32(vi2x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk01c0, vget_high_f32(vi0x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk01c0, vget_high_f32(vi2x2), 0);
        }

        const float32x4_t vk11c0 = vld1q_f32(w + 44);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c0, vget_low_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c0, vget_low_f32(vi3x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk11c0, vget_high_f32(vi1x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk11c0, vget_high_f32(vi3x2), 0);
        }

        const float32x4_t vk21c0 = vld1q_f32(w + 48);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c0, vget_low_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c0, vget_low_f32(vi4x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk21c0, vget_high_f32(vi2x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk21c0, vget_high_f32(vi4x2), 0);
        }

        const float32x4_t vk01c1 = vld1q_f32(w + 52);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c1, vget_low_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c1, vget_low_f32(vi2x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk01c1, vget_high_f32(vi0x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk01c1, vget_high_f32(vi2x2), 1);
        }

        const float32x4_t vk11c1 = vld1q_f32(w + 56);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c1, vget_low_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c1, vget_low_f32(vi3x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk11c1, vget_high_f32(vi1x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk11c1, vget_high_f32(vi3x2), 1);
        }

        const float32x4_t vk21c1 = vld1q_f32(w + 60);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1);
        }

        const float32x4_t vk01c2 = vld1q_f32(w + 64);

        float32x4_t vi0x3 = vmovq_n_f32(0.0f);
        float32x4_t vi1x3 = vmovq_n_f32(0.0f);
        float32x4_t vi2x3 = vmovq_n_f32(0.0f);
        float32x4_t vi3x3 = vmovq_n_f32(0.0f);
        float32x4_t vi4x3 = vmovq_n_f32(0.0f);
        if (iw > 2) {
          // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
          vi0x3 = vld1q_lane_f32(i0 + 8, vi0x3, 0);
          vi1x3 = vld1q_lane_f32(i1 + 8, vi1x3, 0);
          vi2x3 = vld1q_lane_f32(i2 + 8, vi2x3, 0);
          vi3x3 = vld1q_lane_f32(i3 + 8, vi3x3, 0);
          vi4x3 = vld1q_lane_f32(i4 + 8, vi4x3, 0);
        }

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c2, vget_high_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c2, vget_high_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c2, vget_low_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c2, vget_low_f32(vi2x3), 0);

        const float32x4_t vk11c2 = vld1q_f32(w + 68);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c2, vget_high_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c2, vget_high_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c2, vget_low_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c2, vget_low_f32(vi3x3), 0);

        const float32x4_t vk21c2 = vld1q_f32(w + 72);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c2, vget_high_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c2, vget_high_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c2, vget_low_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);

        if (iw >= 2) {
          const float32x4_t vk02c0 = vld1q_f32(w + 76);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c0, vget_high_f32(vi0x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c0, vget_high_f32(vi2x1), 1);

          const float32x4_t vk12c0 = vld1q_f32(w + 80);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c0, vget_high_f32(vi1x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c0, vget_high_f32(vi3x1), 1);

          const float32x4_t vk22c0 = vld1q_f32(w + 84);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c0, vget_high_f32(vi2x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c0, vget_high_f32(vi4x1), 1);

          const float32x4_t vk02c1 = vld1q_f32(w + 88);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c1, vget_low_f32(vi0x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c1, vget_low_f32(vi2x2), 0);

          const float32x4_t vk12c1 = vld1q_f32(w + 92);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c1, vget_low_f32(vi1x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c1, vget_low_f32(vi3x2), 0);

          const float32x4_t vk22c1 = vld1q_f32(w + 96);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c1, vget_low_f32(vi2x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c1, vget_low_f32(vi4x2), 0);

          const float32x4_t vk02c2 = vld1q_f32(w + 100);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c2, vget_low_f32(vi0x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c2, vget_low_f32(vi2x2), 1);

          const float32x4_t vk12c2 = vld1q_f32(w + 104);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c2, vget_low_f32(vi1x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c2, vget_low_f32(vi3x2), 1);

          const float32x4_t vk22c2 = vld1q_f32(w + 108);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c2, vget_low_f32(vi2x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c2, vget_low_f32(vi4x2), 1);
        }

        vo0x0 = vmaxq_f32(vo0x0, vmin);
        vo1x0 = vmaxq_f32(vo1x0, vmin);
        vo0x1 = vmaxq_f32(vo0x1, vmin);
        vo1x1 = vmaxq_f32(vo1x1, vmin);

        vo0x0 = vminq_f32(vo0x0, vmax);
        vo1x0 = vminq_f32(vo1x0, vmax);
        vo0x1 = vminq_f32(vo0x1, vmax);
        vo1x1 = vminq_f32(vo1x1, vmax);

        if (iw == 3) {
          // Exactly 2 output width elements remaining
          const float32x4x2_t vo0c0123 = vzipq_f32(vo0x0, vo0x1);
          const float32x4x2_t vo1c0123 = vzipq_f32(vo1x0, vo1x1);

          vst1_f32(o1c0, vget_low_f32(vo1c0123.val[0])); o1c0 += 2;
          vst1_f32(o1c1, vget_high_f32(vo1c0123.val[0])); o1c1 += 2;
          vst1_f32(o1c2, vget_low_f32(vo1c0123.val[1])); o1c2 += 2;
          vst1_f32(o1c3, vget_high_f32(vo1c0123.val[1])); o1c3 += 2;

          vst1_f32(o0c0, vget_low_f32(vo0c0123.val[0])); o0c0 += 2;
vget_high_f32(vo0c0123.val[0])); o0c1 += 2; vst1_f32(o0c2, vget_low_f32(vo0c0123.val[1])); o0c2 += 2; vst1_f32(o0c3, vget_high_f32(vo0c0123.val[1])); o0c3 += 2; } else { // Exactly 1 output width element remaining vst1q_lane_f32(o1c0, vo1x0, 0); o1c0 += 1; vst1q_lane_f32(o1c1, vo1x0, 1); o1c1 += 1; vst1q_lane_f32(o1c2, vo1x0, 2); o1c2 += 1; vst1q_lane_f32(o1c3, vo1x0, 3); o1c3 += 1; vst1q_lane_f32(o0c0, vo0x0, 0); o0c0 += 1; vst1q_lane_f32(o0c1, vo0x0, 1); o0c1 += 1; vst1q_lane_f32(o0c2, vo0x0, 2); o0c2 += 1; vst1q_lane_f32(o0c3, vo0x0, 3); o0c3 += 1; } } // Move output pointers back to the position of the first pixel in a row, // and forward to the next block of output channels. o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment); o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment); o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment); o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment); o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment); o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment); o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment); o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment); // Revert input pointers to the position of the first pixel in a row i0 = (const float*) ((uintptr_t) i0 - input_width_increment); i1 = (const float*) ((uintptr_t) i1 - input_width_increment); i2 = (const float*) ((uintptr_t) i2 - input_width_increment); i3 = (const float*) ((uintptr_t) i3 - input_width_increment); i4 = (const float*) ((uintptr_t) i4 - input_width_increment); // Move to the block of weights for the next 4 output channels w += 112; c = doz(c, 4); } while (c != 0); // Move output pointers forward to the next two rows output0 = (float*) ((uintptr_t) output1 + output_height_stride); output1 = (float*) ((uintptr_t) output0 + output_height_stride); // Move input pointers forward to the next four rows i0 = i4; i1 = (const float*) ((uintptr_t) i0 + input_height_stride); i2 = (const float*) ((uintptr_t) i1 + input_height_stride); i3 = (const float*) ((uintptr_t) i2 + input_height_stride); i4 = (const float*) ((uintptr_t) i3 + input_height_stride); } } void xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if
XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const float*) ((uintptr_t) i9 + input_offset); } const float* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const float*) ((uintptr_t) i10 + input_offset); } const float* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const float*) ((uintptr_t) i11 + input_offset); } const float* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const float*) ((uintptr_t) i12 + input_offset); } const float* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const float*) ((uintptr_t) i13 + input_offset); } const float* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const float*) ((uintptr_t) i14 + input_offset); } const float* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const float*) ((uintptr_t) i15 + input_offset); } const float* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const float*) ((uintptr_t) i16 + input_offset); } const float* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const float*) ((uintptr_t) i17 + input_offset); } const float* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const float*) ((uintptr_t) i18 + input_offset); } const float* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const float*) ((uintptr_t) i19 + input_offset); } const float* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const float*) ((uintptr_t) i20 + input_offset); } const float* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const float*) ((uintptr_t) i21 + input_offset); } const float* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const float*) ((uintptr_t) i22 + input_offset); } const float* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const float*) ((uintptr_t) i23 + input_offset); } const float* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const float*) ((uintptr_t) i24 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 
= vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vi9x4567 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w); w += 4; const float32x4_t vk9x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi9x4567, vk9x4567); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vi10x4567 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w); w += 4; const float32x4_t vk10x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi10x4567, vk10x4567); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vi11x4567 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w); w += 4; const float32x4_t vk11x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi11x4567, vk11x4567); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vi12x4567 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w); w += 4; const float32x4_t vk12x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi12x4567, vk12x4567); const float32x4_t 
vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vi13x4567 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w); w += 4; const float32x4_t vk13x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi13x4567, vk13x4567); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vi14x4567 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w); w += 4; const float32x4_t vk14x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi14x4567, vk14x4567); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vi15x4567 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w); w += 4; const float32x4_t vk15x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi15x4567, vk15x4567); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vi16x4567 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w); w += 4; const float32x4_t vk16x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi16x4567, vk16x4567); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vi17x4567 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w); w += 4; const float32x4_t vk17x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi17x4567, vk17x4567); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vi18x4567 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w); w += 4; const float32x4_t vk18x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi18x4567, vk18x4567); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vi19x4567 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w); w += 4; const float32x4_t vk19x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi19x4567, vk19x4567); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vi20x4567 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w); w += 4; const float32x4_t vk20x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi20x4567, vk20x4567); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vi21x4567 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w); w += 4; const float32x4_t vk21x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi21x4567, vk21x4567); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vi22x4567 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w); w += 4; const float32x4_t vk22x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi22x4567, vk22x4567); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vi23x4567 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w); w += 4; const float32x4_t vk23x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, 
vi23x0123, vk23x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi23x4567, vk23x4567); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vi24x4567 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w); w += 4; const float32x4_t vk24x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi24x4567, vk24x4567); // Add up all accumulators to vacc01234567p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w + 76); vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w + 84); vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w + 92); vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w + 100); vacc0123p0 = vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w + 108); vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w + 116); vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w + 124); vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w + 132); 
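// Weight addressing note for this 4-channel remainder loop: the weights were
// packed for the 8-channel tile (8 bias floats, then 8 floats per tap), so
// the first four channels of tap t sit at offset 8 + 8*t from the start of
// the weight block. With w already advanced 4 floats past the bias, tap t is
// loaded from w + 4 + 8*t (e.g. t = 16 gives the w + 132 just above).
// Equivalent plain-index form (illustrative only):
//
//   const float* vk_t = w + 4 + 8 * t;  /* channels 0..3 of tap t */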
vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w + 140); vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w + 148); vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w + 156); vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w + 164); vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w + 172); vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w + 180); vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w + 188); vacc0123p1 = vmlaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w + 196); vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); const float32x4_t vk9x0123 = vld1q_f32(w + 80); vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); const float32x4_t vk10x0123 = vld1q_f32(w + 88); vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); const float32x4_t vk11x0123 = vld1q_f32(w + 96); vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); const float32x4_t vk12x0123 = vld1q_f32(w + 104); vacc0123p0 = 
vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); const float32x4_t vk13x0123 = vld1q_f32(w + 112); vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); const float32x4_t vk14x0123 = vld1q_f32(w + 120); vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); const float32x4_t vk15x0123 = vld1q_f32(w + 128); vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); const float32x4_t vk16x0123 = vld1q_f32(w + 136); vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); const float32x4_t vk17x0123 = vld1q_f32(w + 144); vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); const float32x4_t vk18x0123 = vld1q_f32(w + 152); vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); const float32x4_t vk19x0123 = vld1q_f32(w + 160); vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); const float32x4_t vk20x0123 = vld1q_f32(w + 168); vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); const float32x4_t vk21x0123 = vld1q_f32(w + 176); vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); const float32x4_t vk22x0123 = vld1q_f32(w + 184); vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); const float32x4_t vk23x0123 = vld1q_f32(w + 192); vacc0123p1 = vmlaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); const float32x4_t vk24x0123 = vld1q_f32(w + 200); vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const
float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_4p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if
XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi3x4567, vk3x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = 
vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 8); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* w = weights; // First pass to process 8 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } input += 8; // Process c channels and write to buffer.
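// This multipass kernel (8 taps first, 8 per middle pass, up to 9 last, per
// its 8f8m9l name) spreads a large kernel_size over several sweeps through
// the `buffer` scratch area: the first pass stores bias plus taps 0..7, each
// middle pass reloads the partial sums and folds in 8 more taps, and the
// last pass adds the remaining taps, clamps, and writes the output. Roughly
// (illustrative pseudocode only):
//
//   buffer[c]  = bias[c] + sum(taps 0..7)           /* first pass      */
//   buffer[c] += sum(next 8 taps)                   /* middle pass(es) */
//   output[c]  = clamp(buffer[c] + sum(last taps))  /* last pass, <= 9 */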
size_t c = 0; for (; c < channels; c += 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Middle pass to process 8 inputs in each iteration. for (size_t ks = kernel_size - 8; ks > 9; ks -= 8) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } input += 8; size_t c = 0; for (; c < channels; c += 4) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = 
vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Last pass to process up to 9 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; float32x4_t vk8x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, 
vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); float32x4_t vk8x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_9p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if
XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; 
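// The 8-channel tile is held as two q-registers (channels 0-3 and 4-7), so
// the clamped results go out as two 4-float stores. The clamp applied just
// above is the usual minmax idiom, per lane (illustrative scalar form):
//
//   out[c] = fminf(fmaxf(acc[c], params->scalar.min), params->scalar.max);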
vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4( size_t input_height, size_t input_width, const float* input, const 
float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top == 1); const uint32x4_t vmask = vld1q_u32(params->neon_stride1.mask); const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride1.max); const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride1.min); const float32x4_t vw0123 = vld1q_f32(weights); const float32x4_t vw4567 = vld1q_f32(weights + 4); const float32x2_t vw89 = vld1_f32(weights + 8); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = input; const float* i2 = (const float*) ((uintptr_t) i1 + input_width); const float* i3 = (const float*) ((uintptr_t) i2 + input_width); float* o0 = output; float* o1 = (float*) ((uintptr_t) o0 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i2 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i3 = zero; } float32x4_t vi0x0123 = vmovq_n_f32(0.0f); float32x4_t vi1x0123 = vmovq_n_f32(0.0f); float32x4_t vi2x0123 = vmovq_n_f32(0.0f); float32x4_t vi3x0123 = vmovq_n_f32(0.0f); float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; size_t w = input_width; for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) { float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4; const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4; vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3); const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1); vi0x0123 = vi0x4567; vi1x0123 = vi1x4567; vi2x0123 = vi2x4567; vi3x0123 = vi3x4567; const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1); const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1); const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); vo1p0 =
vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); vi0x4567 = vi0x89AB; vi1x4567 = vi1x89AB; vi2x4567 = vi2x89AB; vi3x4567 = vi3x89AB; float32x4_t vo0 = vmaxq_f32(vo0p0, vmin); float32x4_t vo1 = vmaxq_f32(vo1p0, vmin); vo0 = vminq_f32(vo0, vmax); vo1 = vminq_f32(vo1, vmax); vst1q_f32(o1, vo1); o1 += 4; vst1q_f32(o0, vo0); o0 += 4; } // Always process the last block of 1..4 pixels. assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567))); vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567))); vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567))); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3); const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1); const float32x4_t vzero = vmovq_n_f32(0.0f); const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1); const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1); const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); float32x4_t vo0 = vmaxq_f32(vo0p0, vmin); float32x4_t vo1 = vmaxq_f32(vo1p0, vmin); vo0 = vminq_f32(vo0, vmax); vo1 = vminq_f32(vo1, vmax); if XNN_LIKELY(w == 4 * sizeof(float)) { vst1q_f32(o1, vo1); o1 += 4; vst1q_f32(o0, vo0); o0 += 4; } else { float32x2_t vo0_lo = vget_low_f32(vo0); float32x2_t vo1_lo = vget_low_f32(vo1); if (w & (2 * sizeof(float))) { vst1_f32(o1, vo1_lo); o1 += 2; vst1_f32(o0, vo0_lo); o0 += 2; vo0_lo = vget_high_f32(vo0); vo1_lo = vget_high_f32(vo1); } if (w & (1 * sizeof(float))) { vst1_lane_f32(o1, vo1_lo, 0); o1 += 1; vst1_lane_f32(o0, vo0_lo, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i2 - input_decrement); i1 = (const float*) ((uintptr_t) i3 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); o0 = o1; o1 = (float*) ((uintptr_t) o0 + input_width); output_height = doz(output_height, 2); } while (output_height != 0); } void 
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 0);
  assert(padding_top <= 1);

  const uint32x4_t vmask_even = vld1q_u32(params->neon_stride2.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon_stride2.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride2.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride2.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x2_t vw89 = vld1_f32(weights + 8);

  const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));

  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
  if XNN_UNPREDICTABLE(padding_top != 0) {
    i0 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
      i2 = zero;
    }

    float32x4_t vi0x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi1x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi2x1357 = vmovq_n_f32(0.0f);

    size_t w = input_width;
    for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0); i0 += 8;
      const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1); i1 += 8;
      const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2); i2 += 8;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vw89, 0);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x8ACE9BDF.val[1], 3);
      vi0x1357 = vi0x8ACE9BDF.val[1];
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x8ACE9BDF.val[1], 3);
      vi1x1357 = vi1x8ACE9BDF.val[1];
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x8ACE9BDF.val[1], 3);
      vi2x1357 = vi2x8ACE9BDF.val[1];

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_high_f32(vw4567), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vw89, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Last block has 0-7 pixels to process.
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0);
      const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1);
      const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2);

      const float32x4_t vi0x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[0])));
      const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[1])));
      const float32x4_t vi1x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[0])));
      const float32x4_t vi1x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[1])));
      const float32x4_t vi2x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[0])));
      const float32x4_t vi2x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[1])));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE, vw89, 0);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x9BDF, 3);
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x9BDF, 3);
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x9BDF, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_high_f32(vw4567), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x9BDF, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x9BDF, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x9BDF, vw89, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      w += 1 * sizeof(float);
      if (w & (8 * sizeof(float))) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w & (4 * sizeof(float))) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w & (2 * sizeof(float))) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i0 + input_width);
    i2 = (const float*) ((uintptr_t) i1 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}
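// The stride-2 kernel above leans on vld2q_f32 to load eight consecutive
// pixels and split them into an even-index vector and an odd-index vector in
// a single instruction. A standalone illustration of that deinterleaving
// (buffer names are hypothetical, not part of the kernel):
static void example_vld2q_deinterleave(const float* x /* 8 floats */, float* even, float* odd)
{
  const float32x4x2_t v = vld2q_f32(x);
  vst1q_f32(even, v.val[0]);  // {x[0], x[2], x[4], x[6]}
  vst1q_f32(odd, v.val[1]);   // {x[1], x[3], x[5], x[7]}
}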
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const uint32x4_t vmask = vld1q_u32(params->neon_stride1.mask);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride1.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride1.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
    }

    float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0123 = vmovq_n_f32(0.0f);

    float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
    float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
    float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
    float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
    float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;

    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
      float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
      float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
      float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
      float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;

      vi0x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x89AB)));
      vi1x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x89AB)));
      vi2x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x89AB)));
      vi3x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x89AB)));
      vi4x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x89AB)));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;

      w -= 4 * sizeof(float);
    }

    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
    {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
      vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
      vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
      vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
      vi4x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x4567)));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vzero = vmovq_n_f32(0.0f);
      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x5678, vzero, 1);
      const float32x4_t vi1x6789 = vextq_f32(vi1x5678, vzero, 1);
      const float32x4_t vi2x6789 = vextq_f32(vi2x5678, vzero, 1);
      const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1);
      const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      if XNN_LIKELY(w & (4 * sizeof(float))) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w & (2 * sizeof(float))) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w & (1 * sizeof(float))) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i1 - input_decrement);
    i1 = (const float*) ((uintptr_t) i2 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
  } while (--output_height != 0);
}
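// For the 5x5 kernels the packed weights appear to be one bias value followed
// by the 25 taps in row-major order: weights[0] = bias and
// weights[1 + ky * 5 + kx] = k(ky, kx). This is inferred from the lane usage
// above (e.g. the center tap vi2x4567 * vwCDEF lane 1 = weights[13] = 1 + 2*5 + 2)
// and is stated as an assumption, not as the documented packing contract.
// Under that layout, one output pixel reduces to:
static float example_dwconv5x5_pixel(const float* rows[5], size_t ox, const float* weights)
{
  float acc = weights[0];  // bias
  for (size_t ky = 0; ky < 5; ky++) {
    for (size_t kx = 0; kx < 5; kx++) {
      acc += rows[ky][ox + kx] * weights[1 + ky * 5 + kx];
    }
  }
  return acc;
}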
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

  const uint32x4_t vmask_even = vld1q_u32(params->neon_stride2.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon_stride2.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride2.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride2.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i4 = zero;
    }

    float32x4_t vi0x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0246 = vmovq_n_f32(0.0f);

    float32x4_t vi0x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi1x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi2x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi3x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi4x1357 = vmovq_n_f32(0.0f);

    float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0); i0 += 8;
    float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1); i1 += 8;
    float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2); i2 += 8;
    float32x4x2_t vi3x8ACE9BDF = vld2q_f32(i3); i3 += 8;
    float32x4x2_t vi4x8ACE9BDF = vld2q_f32(i4); i4 += 8;

    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[1], vwOP, 0);

      const float32x4_t vi0x68AC = vextq_f32(vi0x0246, vi0x8ACE9BDF.val[0], 3);
      vi0x0246 = vi0x8ACE9BDF.val[0];
      const float32x4_t vi1x68AC = vextq_f32(vi1x0246, vi1x8ACE9BDF.val[0], 3);
      vi1x0246 = vi1x8ACE9BDF.val[0];
      const float32x4_t vi2x68AC = vextq_f32(vi2x0246, vi2x8ACE9BDF.val[0], 3);
      vi2x0246 = vi2x8ACE9BDF.val[0];
      const float32x4_t vi3x68AC = vextq_f32(vi3x0246, vi3x8ACE9BDF.val[0], 3);
      vi3x0246 = vi3x8ACE9BDF.val[0];
      const float32x4_t vi4x68AC = vextq_f32(vi4x0246, vi4x8ACE9BDF.val[0], 3);
      vi4x0246 = vi4x8ACE9BDF.val[0];

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x68AC, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x68AC, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x68AC, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x68AC, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x68AC, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x8ACE9BDF.val[1], 3);
      vi0x1357 = vi0x8ACE9BDF.val[1];
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x8ACE9BDF.val[1], 3);
      vi1x1357 = vi1x8ACE9BDF.val[1];
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x8ACE9BDF.val[1], 3);
      vi2x1357 = vi2x8ACE9BDF.val[1];
      const float32x4_t vi3x79BD = vextq_f32(vi3x1357, vi3x8ACE9BDF.val[1], 3);
      vi3x1357 = vi3x8ACE9BDF.val[1];
      const float32x4_t vi4x79BD = vextq_f32(vi4x1357, vi4x8ACE9BDF.val[1], 3);
      vi4x1357 = vi4x8ACE9BDF.val[1];

      const float32x4x2_t vi0xGIKMHJLN = vld2q_f32(i0); i0 += 8;
      const float32x4x2_t vi1xGIKMHJLN = vld2q_f32(i1); i1 += 8;
      const float32x4x2_t vi2xGIKMHJLN = vld2q_f32(i2); i2 += 8;
      const float32x4x2_t vi3xGIKMHJLN = vld2q_f32(i3); i3 += 8;
      const float32x4x2_t vi4xGIKMHJLN = vld2q_f32(i4); i4 += 8;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x79BD, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x79BD, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0xACEG = vextq_f32(vi0x8ACE9BDF.val[0], vi0xGIKMHJLN.val[0], 1);
      vi0x8ACE9BDF = vi0xGIKMHJLN;
      const float32x4_t vi1xACEG = vextq_f32(vi1x8ACE9BDF.val[0], vi1xGIKMHJLN.val[0], 1);
      vi1x8ACE9BDF = vi1xGIKMHJLN;
      const float32x4_t vi2xACEG = vextq_f32(vi2x8ACE9BDF.val[0], vi2xGIKMHJLN.val[0], 1);
      vi2x8ACE9BDF = vi2xGIKMHJLN;
      const float32x4_t vi3xACEG = vextq_f32(vi3x8ACE9BDF.val[0], vi3xGIKMHJLN.val[0], 1);
      vi3x8ACE9BDF = vi3xGIKMHJLN;
      const float32x4_t vi4xACEG = vextq_f32(vi4x8ACE9BDF.val[0], vi4xGIKMHJLN.val[0], 1);
      vi4x8ACE9BDF = vi4xGIKMHJLN;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0xACEG, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1xACEG, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2xACEG, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3xACEG, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4xACEG, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Last block has 1-8 pixels to process.
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4_t vi0x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[0])));
      const float32x4_t vi1x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[0])));
      const float32x4_t vi2x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[0])));
      const float32x4_t vi3x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi3x8ACE9BDF.val[0])));
      const float32x4_t vi4x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi4x8ACE9BDF.val[0])));

      const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[1])));
      const float32x4_t vi1x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[1])));
      const float32x4_t vi2x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[1])));
      const float32x4_t vi3x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi3x8ACE9BDF.val[1])));
      const float32x4_t vi4x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi4x8ACE9BDF.val[1])));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE, vget_high_f32(vwKLMN), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x9BDF, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x9BDF, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x9BDF, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x9BDF, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x9BDF, vwOP, 0);

      const float32x4_t vi0x68AC = vextq_f32(vi0x0246, vi0x8ACE, 3);
      const float32x4_t vi1x68AC = vextq_f32(vi1x0246, vi1x8ACE, 3);
      const float32x4_t vi2x68AC = vextq_f32(vi2x0246, vi2x8ACE, 3);
      const float32x4_t vi3x68AC = vextq_f32(vi3x0246, vi3x8ACE, 3);
      const float32x4_t vi4x68AC = vextq_f32(vi4x0246, vi4x8ACE, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x68AC, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x68AC, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x68AC, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x68AC, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x68AC, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x9BDF, 3);
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x9BDF, 3);
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x9BDF, 3);
      const float32x4_t vi3x79BD = vextq_f32(vi3x1357, vi3x9BDF, 3);
      const float32x4_t vi4x79BD = vextq_f32(vi4x1357, vi4x9BDF, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x79BD, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x79BD, vget_high_f32(vwKLMN), 0);

      const float32x4_t vzero = vmovq_n_f32(0.0f);
      const float32x4_t vi0xACEG = vextq_f32(vi0x8ACE, vzero, 1);
      const float32x4_t vi1xACEG = vextq_f32(vi1x8ACE, vzero, 1);
      const float32x4_t vi2xACEG = vextq_f32(vi2x8ACE, vzero, 1);
      const float32x4_t vi3xACEG = vextq_f32(vi3x8ACE, vzero, 1);
      const float32x4_t vi4xACEG = vextq_f32(vi4x8ACE, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0xACEG, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1xACEG, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2xACEG, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3xACEG, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4xACEG, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w_tmp & 2) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w_tmp & 1) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i3 - input_decrement);
    i2 = (const float*) ((uintptr_t) i4 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}
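// Note on the stride-2 tails: w counts the remaining *input* bytes, while
// stores happen per *output* pixel. With 2x subsampling the number of output
// pixels left is ceil(w_in_floats / 2), which is what the
// (w + 1 * sizeof(float)) / (2 * sizeof(float)) expression in the 5x5s2p2 tail
// above computes; e.g. 7 remaining input floats yield (7 + 1) / 2 = 4 output
// pixels, exactly one full float32x4_t store.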
void xnn_f32_f16_vcvt_ukernel__neon_x8(
    size_t batch, const float* input, void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const uint32x4_t vexp_bias = vld1q_dup_u32(&params->neon.exp_bias);
  const float32x4_t vscale_to_inf = vld1q_dup_f32(&params->neon.scale_to_inf);
  const uint32x4_t vexpw_max = vld1q_dup_u32(&params->neon.expw_max);
  const float32x4_t vscale_to_zero = vld1q_dup_f32(&params->neon.scale_to_zero);
  const uint32x4_t vbias_min = vdupq_n_u32(UINT32_C(0x40000000));
  const uint16x8_t vexph_mask = vdupq_n_u16(UINT16_C(0x7C00));
  const uint16x8_t vmanth_mask = vdupq_n_u16(UINT16_C(0x0FFF));
  const uint16x8_t vsignh_mask = vdupq_n_u16(UINT16_C(0x8000));
  const uint16x8_t vnanh = vdupq_n_u16(UINT16_C(0x7E00));

  uint16_t* o = (uint16_t*) output;
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const float32x4_t vx0 = vld1q_f32(input); input += 4;
    const float32x4_t vx1 = vld1q_f32(input); input += 4;

    const float32x4_t vabsx0 = vabsq_f32(vx0);
    const float32x4_t vabsx1 = vabsq_f32(vx1);

    uint32x4_t vbias0 = vaddq_u32(vreinterpretq_u32_f32(vabsx0), vexp_bias);
    uint32x4_t vbias1 = vaddq_u32(vreinterpretq_u32_f32(vabsx1), vexp_bias);

    float32x4_t vf0 = vmulq_f32(vabsx0, vscale_to_inf);
    float32x4_t vf1 = vmulq_f32(vabsx1, vscale_to_inf);
    const uint32x4_t vnanmaskw0 = vcgtq_u32(vreinterpretq_u32_f32(vabsx0), vexpw_max);
    const uint32x4_t vnanmaskw1 = vcgtq_u32(vreinterpretq_u32_f32(vabsx1), vexpw_max);

    vbias0 = vandq_u32(vbias0, vexpw_max);
    vbias1 = vandq_u32(vbias1, vexpw_max);
    vf0 = vmulq_f32(vf0, vscale_to_zero);
    vf1 = vmulq_f32(vf1, vscale_to_zero);
    const uint16x8_t vnanmaskh0 = vcombine_u16(vmovn_u32(vnanmaskw0), vmovn_u32(vnanmaskw1));

    vbias0 = vmaxq_u32(vbias0, vbias_min);
    vbias1 = vmaxq_u32(vbias1, vbias_min);

    vf0 = vaddq_f32(vf0, vreinterpretq_f32_u32(vbias0));
    vf1 = vaddq_f32(vf1, vreinterpretq_f32_u32(vbias1));

    uint16x8_t vexph0 = vcombine_u16(vshrn_n_u32(vreinterpretq_u32_f32(vf0), 13), vshrn_n_u32(vreinterpretq_u32_f32(vf1), 13));
    uint16x8_t vmanth0 = vcombine_u16(vmovn_u32(vreinterpretq_u32_f32(vf0)), vmovn_u32(vreinterpretq_u32_f32(vf1)));
    uint16x8_t vsignh0 = vcombine_u16(vshrn_n_u32(vreinterpretq_u32_f32(vx0), 16), vshrn_n_u32(vreinterpretq_u32_f32(vx1), 16));

    vexph0 = vandq_u16(vexph0, vexph_mask);
    vmanth0 = vandq_u16(vmanth0, vmanth_mask);
    vsignh0 = vandq_u16(vsignh0, vsignh_mask);

    uint16x8_t vh0 = vaddq_u16(vmanth0, vexph0);

    vh0 = vbslq_u16(vnanmaskh0, vnanh, vh0);

    vh0 = vorrq_u16(vh0, vsignh0);

    vst1q_u16(o, vh0); o += 8;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const float32x4_t vx = vld1q_f32(input); input += 4;

    const float32x4_t vabsx = vabsq_f32(vx);

    uint32x4_t vbias = vaddq_u32(vreinterpretq_u32_f32(vabsx), vexp_bias);

    float32x4_t vf = vmulq_f32(vabsx, vscale_to_inf);
    const uint32x4_t vnanmaskw = vcgtq_u32(vreinterpretq_u32_f32(vabsx), vexpw_max);

    vbias = vandq_u32(vbias, vexpw_max);
    vf = vmulq_f32(vf, vscale_to_zero);
    const uint16x4_t vnanmaskh = vmovn_u32(vnanmaskw);

    vbias = vmaxq_u32(vbias, vbias_min);

    vf = vaddq_f32(vf, vreinterpretq_f32_u32(vbias));

    uint16x4_t vexph = vshrn_n_u32(vreinterpretq_u32_f32(vf), 13);
    uint16x4_t vmanth = vmovn_u32(vreinterpretq_u32_f32(vf));
    uint16x4_t vsignh = vshrn_n_u32(vreinterpretq_u32_f32(vx), 16);

    vexph = vand_u16(vexph, vget_low_u16(vexph_mask));
    vmanth = vand_u16(vmanth, vget_low_u16(vmanth_mask));
    vsignh = vand_u16(vsignh, vget_low_u16(vsignh_mask));

    uint16x4_t vh = vadd_u16(vmanth, vexph);

    vh = vbsl_u16(vnanmaskh, vget_low_u16(vnanh), vh);

    vh = vorr_u16(vh, vsignh);

    vst1_u16(o, vh); o += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch % sizeof(float) == 0);
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 3 * sizeof(float));

    const float32x4_t vx = vld1q_f32(input);

    const float32x4_t vabsx = vabsq_f32(vx);

    uint32x4_t vbias = vaddq_u32(vreinterpretq_u32_f32(vabsx), vexp_bias);

    float32x4_t vf = vmulq_f32(vabsx, vscale_to_inf);
    const uint32x4_t vnanmaskw = vcgtq_u32(vreinterpretq_u32_f32(vabsx), vexpw_max);

    vbias = vandq_u32(vbias, vexpw_max);
    vf = vmulq_f32(vf, vscale_to_zero);
    const uint16x4_t vnanmaskh = vmovn_u32(vnanmaskw);

    vbias = vmaxq_u32(vbias, vbias_min);

    vf = vaddq_f32(vf, vreinterpretq_f32_u32(vbias));

    uint16x4_t vexph = vshrn_n_u32(vreinterpretq_u32_f32(vf), 13);
    uint16x4_t vmanth = vmovn_u32(vreinterpretq_u32_f32(vf));
    uint16x4_t vsignh = vshrn_n_u32(vreinterpretq_u32_f32(vx), 16);

    vexph = vand_u16(vexph, vget_low_u16(vexph_mask));
    vmanth = vand_u16(vmanth, vget_low_u16(vmanth_mask));
    vsignh = vand_u16(vsignh, vget_low_u16(vsignh_mask));

    uint16x4_t vh = vadd_u16(vmanth, vexph);

    vh = vbsl_u16(vnanmaskh, vget_low_u16(vnanh), vh);

    vh = vorr_u16(vh, vsignh);

    if (batch & (2 * sizeof(float))) {
      vst1_lane_u32((void*) o, vreinterpret_u32_u16(vh), 0); o += 2;
      vh = vext_u16(vh, vh, 2);
    }
    if (batch & (1 * sizeof(float))) {
      vst1_lane_u16(o, vh, 0);
    }
  }
}
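// The float->half conversion above follows a well-known bit-manipulation
// scheme (it resembles the approach popularized by the fp16 library): |x| is
// scaled by scale_to_inf and then scale_to_zero so that overflow saturates to
// infinity and subnormal halves round correctly in float arithmetic; a rebased
// exponent is formed from (bits(|x|) + exp_bias) masked to the exponent field
// and clamped below by 0x40000000. The half is then assembled from the shifted
// exponent bits (mask 0x7C00), the low mantissa bits (mask 0x0FFF), the
// original sign (mask 0x8000), and a canonical NaN (0x7E00) wherever |x|
// compares above expw_max.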
void xnn_f32_gavgpool_cw_ukernel__neon_x4(
    size_t elements, size_t channels, const float* input, float* output,
    const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(elements != 0);
  assert(elements % sizeof(float) == 0);
  assert(channels != 0);

  const uint32x4_t vmask = vld1q_u32(params->neon.mask);
  const float32x2_t vmultiplier = vld1_dup_f32(&params->neon.multiplier);
  const float32x2_t voutput_min = vld1_dup_f32(&params->neon.output_min);
  const float32x2_t voutput_max = vld1_dup_f32(&params->neon.output_max);

  do {
    float32x4_t vsum0 = vmovq_n_f32(0.0f);
    size_t n = elements;
    if (n >= 16 * sizeof(float)) {
      float32x4_t vsum1 = vmovq_n_f32(0.0f);
      do {
        const float32x4_t vi0 = vld1q_f32(input);
        const float32x4_t vi1 = vld1q_f32(input + 4);
        const float32x4_t vi2 = vld1q_f32(input + 8);
        const float32x4_t vi3 = vld1q_f32(input + 12);
        input += 16;
        const float32x4_t acc0 = vaddq_f32(vi0, vi1);
        const float32x4_t acc1 = vaddq_f32(vi2, vi3);
        vsum0 = vaddq_f32(vsum0, acc0);
        vsum1 = vaddq_f32(vsum1, acc1);
        n -= 16 * sizeof(float);
      } while (n >= 32 * sizeof(float));
      vsum0 = vaddq_f32(vsum0, vsum1);
    }

    while (n >= 4 * sizeof(float)) {
      const float32x4_t vi0 = vld1q_f32(input); input += 4;
      vsum0 = vaddq_f32(vsum0, vi0);
      n -= 4 * sizeof(float);
    }

    if XNN_UNLIKELY(n != 0) {
      float32x4_t vi0 = vld1q_f32(input);
      input = (const float*) ((uintptr_t) input + n);
      vi0 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0)));
      vsum0 = vaddq_f32(vsum0, vi0);
    }

    const float32x2_t vout2 = vpadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0));
    const float32x2_t vout1 = vpadd_f32(vout2, vout2);

    float32x2_t vout = vmul_f32(vout1, vmultiplier);
    vout = vmax_f32(vout, voutput_min);
    vout = vmin_f32(vout, voutput_max);

    vst1_lane_f32(output, vout, 0); output += 1;
  } while (--channels != 0);
}

void xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* buffer, float* output,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const float* i0 = input;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
  const size_t packed_channels = round_up_po2(channels, 4);
  const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);

  float* b = buffer;
  for (size_t c = 0; c < channels; c += 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    vst1q_f32(b, vsum); b += 4;
  }
  for (rows -= 7; rows > 7; rows -= 7) {
    b = buffer;

    i0 = (const float*) ((uintptr_t) i0 + input_increment);
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
    i2 = (const float*) ((uintptr_t) i2 + input_increment);
    i3 = (const float*) ((uintptr_t) i3 + input_increment);
    i4 = (const float*) ((uintptr_t) i4 + input_increment);
    i5 = (const float*) ((uintptr_t) i5 + input_increment);
    i6 = (const float*) ((uintptr_t) i6 + input_increment);

    for (size_t c = 0; c < channels; c += 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vacc = vld1q_f32(b);

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

      const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
      const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

      const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

      vst1q_f32(b, vsum); b += 4;
    }
  }

  i0 = (const float*) ((uintptr_t) i0 + input_increment);
  i1 = (const float*) ((uintptr_t) i1 + input_increment);
  if (rows < 2) {
    i1 = zero;
  }
  i2 = (const float*) ((uintptr_t) i2 + input_increment);
  if (rows <= 2) {
    i2 = zero;
  }
  i3 = (const float*) ((uintptr_t) i3 + input_increment);
  if (rows < 4) {
    i3 = zero;
  }
  i4 = (const float*) ((uintptr_t) i4 + input_increment);
  if (rows <= 4) {
    i4 = zero;
  }
  i5 = (const float*) ((uintptr_t) i5 + input_increment);
  if (rows < 6) {
    i5 = zero;
  }
  i6 = (const float*) ((uintptr_t) i6 + input_increment);
  if (rows <= 6) {
    i6 = zero;
  }
  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  b = buffer;
  while (channels >= 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
    const float32x4_t vacc = vld1q_f32(b); b += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    vst1q_f32(output, vout); output += 4;

    channels -= 4;
  }
  if (channels != 0) {
    const float32x4_t vi0 = vld1q_f32(i0);
    const float32x4_t vi1 = vld1q_f32(i1);
    const float32x4_t vi2 = vld1q_f32(i2);
    const float32x4_t vi3 = vld1q_f32(i3);
    const float32x4_t vi4 = vld1q_f32(i4);
    const float32x4_t vi5 = vld1q_f32(i5);
    const float32x4_t vi6 = vld1q_f32(i6);
    const float32x4_t vacc = vld1q_f32(b);

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    float32x2_t vout_lo = vget_low_f32(vout);
    if (channels & 2) {
      vst1_f32(output, vout_lo); output += 2;
      vout_lo = vget_high_f32(vout);
    }
    if (channels & 1) {
      vst1_lane_f32(output, vout_lo, 0);
    }
  }
}

void xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* output,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows != 0);
  assert(rows <= 7);
  assert(channels != 0);

  const float* i0 = input;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  if (rows < 2) {
    i1 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
  if (rows <= 2) {
    i2 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
  if (rows < 4) {
    i3 = zero;
  }
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
  if (rows <= 4) {
    i4 = zero;
  }
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
  if (rows < 6) {
    i5 = zero;
  }
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
  if (rows <= 6) {
    i6 = zero;
  }
  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  while (channels >= 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    vst1q_f32(output, vout); output += 4;

    channels -= 4;
  }
  if (channels != 0) {
    const float32x4_t vi0 = vld1q_f32(i0);
    const float32x4_t vi1 = vld1q_f32(i1);
    const float32x4_t vi2 = vld1q_f32(i2);
    const float32x4_t vi3 = vld1q_f32(i3);
    const float32x4_t vi4 = vld1q_f32(i4);
    const float32x4_t vi5 = vld1q_f32(i5);
    const float32x4_t vi6 = vld1q_f32(i6);

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    float32x2_t vout_lo = vget_low_f32(vout);
    if (channels & 2) {
      vst1_f32(output, vout_lo); output += 2;
      vout_lo = vget_high_f32(vout);
    }
    if (channels & 1) {
      vst1_lane_f32(output, vout_lo, 0);
    }
  }
}
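// xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4 (above) averages up to 7 rows in
// a single pass, while the 7p7x variant accumulates groups of 7 rows into
// `buffer` (sized for channels rounded up to a multiple of 4) before the same
// scale/clamp epilogue; together they cover any row count. A hedged sketch of
// how a caller might dispatch between them (the real operator-level dispatch
// lives elsewhere in XNNPACK; this helper is illustrative only):
static void example_gavgpool_dispatch(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* buffer, float* output,
    const union xnn_f32_scaleminmax_params* params)
{
  if (rows <= 7) {
    xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4(rows, channels, input, input_stride, zero, output, params);
  } else {
    xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4(rows, channels, input, input_stride, zero, buffer, output, params);
  }
}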
void xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;

  do {
    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;

    size_t k = kc;
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
      const float32x2_t va0 = vld1_f32(a0); a0 += 2;

      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
    }
    if XNN_UNLIKELY(k != 0) {
      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;

      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567 = vld1q_f32(w); w += 4;

      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
    }
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
    vacc0x4567 = vminq_f32(vacc0x4567, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);

    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(c0, vacc0x0123);
      vst1q_f32(c0 + 4, vacc0x4567);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      a0 = (const float*) ((uintptr_t) a0 - kc);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1q_f32(c0, vacc0x0123); c0 += 4;
        vacc0x0123 = vacc0x4567;
      }
      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
      if (nc & 2) {
        vst1_f32(c0, vacc0x01); c0 += 2;
        vacc0x01 = vget_high_f32(vacc0x0123);
      }
      if (nc & 1) {
        vst1_lane_f32(c0, vacc0x01, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  do {
    float32x2_t vacc0x01 = vld1_f32(w); w += 2;
    float32x2_t vacc1x01 = vacc0x01;
    float32x2_t vacc2x01 = vacc0x01;
    float32x2_t vacc3x01 = vacc0x01;

    size_t k = kc;
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
      const float32x2_t va3 = vld1_f32(a3); a3 += 2;

      const float32x4_t vb01c01 = vld1q_f32(w); w += 4;

      const float32x2_t vb01c0 = vget_low_f32(vb01c01);
      const float32x2_t vb01c1 = vget_high_f32(vb01c01);

      vacc0x01 = vmla_lane_f32(vacc0x01, vb01c0, va0, 0);
      vacc1x01 = vmla_lane_f32(vacc1x01, vb01c0, va1, 0);
      vacc2x01 = vmla_lane_f32(vacc2x01, vb01c0, va2, 0);
      vacc3x01 = vmla_lane_f32(vacc3x01, vb01c0, va3, 0);
      vacc0x01 = vmla_lane_f32(vacc0x01, vb01c1, va0, 1);
      vacc1x01 = vmla_lane_f32(vacc1x01, vb01c1, va1, 1);
      vacc2x01 = vmla_lane_f32(vacc2x01, vb01c1, va2, 1);
      vacc3x01 = vmla_lane_f32(vacc3x01, vb01c1, va3, 1);
    }
    if XNN_UNLIKELY(k != 0) {
      const float32x2_t va0 = vld1_dup_f32(a0); a0 += 1;
      const float32x2_t va1 = vld1_dup_f32(a1); a1 += 1;
      const float32x2_t va2 = vld1_dup_f32(a2); a2 += 1;
      const float32x2_t va3 = vld1_dup_f32(a3); a3 += 1;

      const float32x2_t vb01 = vld1_f32(w); w += 2;

      vacc0x01 = vmla_f32(vacc0x01, va0, vb01);
      vacc1x01 = vmla_f32(vacc1x01, va1, vb01);
      vacc2x01 = vmla_f32(vacc2x01, va2, vb01);
      vacc3x01 = vmla_f32(vacc3x01, va3, vb01);
    }

    const float32x2_t vmax = vld1_dup_f32(&params->scalar.max);
    vacc0x01 = vmin_f32(vacc0x01, vmax);
    vacc1x01 = vmin_f32(vacc1x01, vmax);
    vacc2x01 = vmin_f32(vacc2x01, vmax);
    vacc3x01 = vmin_f32(vacc3x01, vmax);

    const float32x2_t vmin = vld1_dup_f32(&params->scalar.min);
    vacc0x01 = vmax_f32(vacc0x01, vmin);
    vacc1x01 = vmax_f32(vacc1x01, vmin);
    vacc2x01 = vmax_f32(vacc2x01, vmin);
    vacc3x01 = vmax_f32(vacc3x01, vmin);

    if XNN_LIKELY(nc >= 2) {
      vst1_f32(c0, vacc0x01);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
      vst1_f32(c1, vacc1x01);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      vst1_f32(c2, vacc2x01);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      vst1_f32(c3, vacc3x01);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);

      a0 = (const float*) ((uintptr_t) a0 - kc);
      a1 = (const float*) ((uintptr_t) a1 - kc);
      a2 = (const float*) ((uintptr_t) a2 - kc);
      a3 = (const float*) ((uintptr_t) a3 - kc);

      nc -= 2;
    } else {
      assert(nc == 1);
      vst1_lane_f32(c0, vacc0x01, 0);
      vst1_lane_f32(c1, vacc1x01, 0);
      vst1_lane_f32(c2, vacc2x01, 0);
      vst1_lane_f32(c3, vacc3x01, 0);

      nc = 0;
    }
  } while (nc != 0);
}
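// Each GEMM microkernel in this file reads `w` as packed weights: `nr` bias
// values followed by kc/sizeof(float) groups of `nr` column values (nr = 8 or
// 2 above). A scalar model of the tile one microkernel computes, under that
// assumed layout (illustrative only, strides in floats rather than bytes):
static void example_gemm_tile_reference(
    size_t mr, size_t nr, size_t kc_floats,
    const float* a, size_t a_stride_floats, const float* w,
    float* c, size_t cm_stride_floats, float min, float max)
{
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < nr; n++) {
      float acc = w[n];  // bias
      for (size_t k = 0; k < kc_floats; k++) {
        acc += a[m * a_stride_floats + k] * w[nr + k * nr + n];
      }
      acc = acc > max ? max : acc;  // like vminq_f32(acc, vmax)
      acc = acc < min ? min : acc;  // like vmaxq_f32(acc, vmin)
      c[m * cm_stride_floats + n] = acc;
    }
  }
}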
void xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  do {
    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
    float32x4_t vacc1x0123 = vacc0x0123;
    float32x4_t vacc1x4567 = vacc0x4567;
    float32x4_t vacc2x0123 = vacc0x0123;
    float32x4_t vacc2x4567 = vacc0x4567;
    float32x4_t vacc3x0123 = vacc0x0123;
    float32x4_t vacc3x4567 = vacc0x4567;

    size_t k = kc;
    if XNN_LIKELY(k >= 4 * sizeof(float)) {
      do {
        const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
        const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
        const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
        const float32x4_t va3 = vld1q_f32(a3); a3 += 4;

        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);

        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);

        const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);

        const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);

        k -= 4 * sizeof(float);
      } while (k >= 4 * sizeof(float));
    }
    if XNN_UNLIKELY(k != 0) {
      if XNN_UNLIKELY(k & (2 * sizeof(float))) {
        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
        const float32x2_t va3 = vld1_f32(a3); a3 += 2;

        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);

        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
      }
      if XNN_UNLIKELY(k & (1 * sizeof(float))) {
        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;

        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
      }
    }
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
    vacc3x4567 = vminq_f32(vacc3x4567, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);

    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(c3, vacc3x0123);
      vst1q_f32(c3 + 4, vacc3x4567);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
      vst1q_f32(c2, vacc2x0123);
      vst1q_f32(c2 + 4, vacc2x4567);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      vst1q_f32(c1, vacc1x0123);
      vst1q_f32(c1 + 4, vacc1x4567);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      vst1q_f32(c0, vacc0x0123);
      vst1q_f32(c0 + 4, vacc0x4567);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      a3 = (const float*) ((uintptr_t) a3 - kc);
      a2 = (const float*) ((uintptr_t) a2 - kc);
      a1 = (const float*) ((uintptr_t) a1 - kc);
      a0 = (const float*) ((uintptr_t) a0 - kc);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1q_f32(c3, vacc3x0123); c3 += 4;
        vst1q_f32(c2, vacc2x0123); c2 += 4;
        vst1q_f32(c1, vacc1x0123); c1 += 4;
        vst1q_f32(c0, vacc0x0123); c0 += 4;

        vacc3x0123 = vacc3x4567;
        vacc2x0123 = vacc2x4567;
        vacc1x0123 = vacc1x4567;
        vacc0x0123 = vacc0x4567;
      }
      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
      if (nc & 2) {
        vst1_f32(c3, vacc3x01); c3 += 2;
        vst1_f32(c2, vacc2x01); c2 += 2;
        vst1_f32(c1, vacc1x01); c1 += 2;
        vst1_f32(c0, vacc0x01); c0 += 2;

        vacc3x01 = vget_high_f32(vacc3x0123);
        vacc2x01 = vget_high_f32(vacc2x0123);
        vacc1x01 = vget_high_f32(vacc1x0123);
        vacc0x01 = vget_high_f32(vacc0x0123);
      }
      if (nc & 1) {
        vst1_lane_f32(c3, vacc3x01, 0);
        vst1_lane_f32(c2, vacc2x01, 0);
        vst1_lane_f32(c1, vacc1x01, 0);
        vst1_lane_f32(c0, vacc0x01, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
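// The _ld128 variant above unrolls the K loop by 4, loading a full 128-bit
// vld1q_f32 of each A row per iteration and broadcasting its lanes with
// vmlaq_lane_f32; the _ld64 variant below unrolls by 2 with 64-bit vld1_f32
// loads. Both finish an odd K remainder with a vld1q_dup_f32 broadcast
// multiply-accumulate.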
xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 =
vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_ibilinear_chw_ukernel__neon_p8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t input_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(input_increment % sizeof(float) == 0); do { const float** i = input; const float* w = weights; size_t p = output_pixels; for (; p >= 8; p -= 8) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset); const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset); const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset); const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset); const float* itl6 = (const float*) ((uintptr_t)
i[12] + input_offset); const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset); const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset); const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset); i += 2 * 8; const float32x4x2_t vw0123 = vld2q_f32(w + 0); const float32x4x2_t vw4567 = vld2q_f32(w + 8); w += 2 * 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x2_t vtltr4 = vld1_f32(itl4); const float32x2_t vblbr4 = vld1_f32(ibl4); const float32x2_t vtltr5 = vld1_f32(itl5); const float32x2_t vblbr5 = vld1_f32(ibl5); const float32x2_t vtltr6 = vld1_f32(itl6); const float32x2_t vblbr6 = vld1_f32(ibl6); const float32x2_t vtltr7 = vld1_f32(itl7); const float32x2_t vblbr7 = vld1_f32(ibl7); const float32x4_t valphah0123 = vw0123.val[0]; const float32x4_t valphav0123 = vw0123.val[1]; const float32x4_t valphah4567 = vw4567.val[0]; const float32x4_t valphav4567 = vw4567.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5); const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5); const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7); const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45); const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67); const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld0123 = vld_t0123.val[0]; const float32x4_t vrd0123 = vld_t0123.val[1]; const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67); const float32x4_t vld4567 = vld_t4567.val[0]; const float32x4_t vrd4567 = vld_t4567.val[1]; const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl0123 = vtl_t0123.val[0]; const float32x4_t vtr0123 = vtl_t0123.val[1]; const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67); const float32x4_t vtl4567 = vtl_t4567.val[0]; const float32x4_t vtr4567 = vtl_t4567.val[1]; const float32x4_t vl0123 = vmlaq_f32(vtl0123, vld0123, valphav0123); const float32x4_t vr0123 = vmlaq_f32(vtr0123, vrd0123, valphav0123); const float32x4_t vl4567 = vmlaq_f32(vtl4567, vld4567, valphav4567); const float32x4_t vr4567 = vmlaq_f32(vtr4567, vrd4567, valphav4567); const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123); const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567); const float32x4_t vo0123 = vmlaq_f32(vl0123, vd0123, valphah0123); const float32x4_t vo4567 = vmlaq_f32(vl4567, vd4567, valphah4567); vst1q_f32(output + 0, vo0123); vst1q_f32(output + 4, vo4567); output += 8; } for (; p >= 4; p -= 4) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) 
i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); i += 8; const float32x4x2_t vw = vld2q_f32(w); w += 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x4_t valphah = vw.val[0]; const float32x4_t valphav = vw.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld = vld_t.val[0]; const float32x4_t vrd = vld_t.val[1]; const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl = vtl_t.val[0]; const float32x4_t vtr = vtl_t.val[1]; const float32x4_t vl = vmlaq_f32(vtl, vld, valphav); const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav); const float32x4_t vd = vsubq_f32(vr, vl); const float32x4_t vo = vmlaq_f32(vl, vd, valphah); vst1q_f32(output, vo); output += 4; } if XNN_UNLIKELY(p != 0) { if (p & 2) { const float32x2x2_t vw = vld2_f32(w); w += 4; const float32x2_t valphah = vw.val[0]; const float32x2_t valphav = vw.val[1]; const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); i += 4; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0); const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1); const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1); const float32x2_t vld = vld_t.val[0]; const float32x2_t vrd = vld_t.val[1]; const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1); const float32x2_t vtl = vtl_t.val[0]; const float32x2_t vtr = vtl_t.val[1]; const float32x2_t vl = vmla_f32(vtl, vld, valphav); const float32x2_t vr = vmla_f32(vtr, vrd, valphav); const float32x2_t vd = vsub_f32(vr, vl); const float32x2_t vo = vmla_f32(vl, vd, valphah); vst1_f32(output, vo); output += 2; } if (p & 1) { // We are computing the following formula: // result = (1 - alpha_h) * (1 - alpha_v) * top_left + // alpha_h * (1 - alpha_v) * top_right + // (1 - alpha_h) * alpha_v * bottom_left + // alpha_h * alpha_v * bottom_right. // // Rearranging gives // result = left + alpha_h * (right - left), // where // left = top_left + alpha_v * (bottom_left - top_left), // right = top_right + alpha_v * (bottom_right - top_right). 
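// A quick numeric check of the rearrangement (values chosen purely for
// illustration): with top_left = 0, top_right = 4, bottom_left = 2,
// bottom_right = 6, alpha_h = 0.25 and alpha_v = 0.5, the direct weighted
// sum gives
//   0.375*0 + 0.125*4 + 0.375*2 + 0.125*6 = 2,
// and the rearranged form agrees: left = 0 + 0.5*(2 - 0) = 1,
// right = 4 + 0.5*(6 - 4) = 5, result = 1 + 0.25*(5 - 1) = 2.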
const float alphah = *w; const float32x2_t valphav = vld1_dup_f32(w + 1); w += 2; const float* itl = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); i += 2; const float32x2_t vtltr = vld1_f32(itl); const float32x2_t vblbr = vld1_f32(ibl); // Compute at once // left_diff = bottom_left - top_left // right_diff = bottom_right - top_right const float32x2_t vldrd = vsub_f32(vblbr, vtltr); const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav); // Extract them and compute the result. const float l = vget_lane_f32(vlr, 0); const float r = vget_lane_f32(vlr, 1); *output++ = l + alphah * (r - l); } } input_offset += input_increment; } while (--channels != 0); } void xnn_f32_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); do { const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset); const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset); const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset); const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset); input += 4; const float32x2_t valphahv = vld1_f32(weights); weights += 2; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtl4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vtd4567 = vsubq_f32(vtr4567, vtl4567); const float32x4_t vbd4567 = vsubq_f32(vbr4567, vbl4567); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vt4567 = vmlaq_lane_f32(vtl4567, vtd4567, valphahv, 0); const float32x4_t vb4567 = vmlaq_lane_f32(vbl4567, vbd4567, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vd4567 = vsubq_f32(vb4567, vt4567); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); const float32x4_t vo4567 = vmlaq_lane_f32(vt4567, vd4567, valphahv, 1); vst1q_f32(output, vo0123); output += 4; vst1q_f32(output, vo4567); output += 4; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); vst1q_f32(output, vo0123); output += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vtl0123 = vld1q_f32(i0); const float32x4_t 
vtr0123 = vld1q_f32(i1); const float32x4_t vbl0123 = vld1q_f32(i2); const float32x4_t vbr0123 = vld1q_f32(i3); const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); float32x2_t vo01 = vget_low_f32(vo0123); if (c & (2 * sizeof(float))) { vst1_f32(output, vo01); output += 2; vo01 = vget_high_f32(vo0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(output, vo01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } p -= 1 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union
xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x2_t vacc0x01 = vld1_f32(w); w += 2; float32x2_t vacc1x01 = vacc0x01; float32x2_t vacc2x01 = vacc0x01; float32x2_t vacc3x01 = vacc0x01; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x2_t vb01c0 = vld1_f32(w); w += 2; vacc0x01 = vmla_lane_f32(vacc0x01, vb01c0, va0, 0); vacc1x01 = vmla_lane_f32(vacc1x01, vb01c0, va1, 0); vacc2x01 = vmla_lane_f32(vacc2x01, vb01c0, va2, 0); vacc3x01 = vmla_lane_f32(vacc3x01, vb01c0, va3, 0); const float32x2_t vb01c1 = vld1_f32(w); w += 2; vacc0x01 = vmla_lane_f32(vacc0x01, vb01c1, va0, 1); vacc1x01 = vmla_lane_f32(vacc1x01, vb01c1, va1, 1); vacc2x01 = vmla_lane_f32(vacc2x01, vb01c1, va2, 1); vacc3x01 = vmla_lane_f32(vacc3x01, vb01c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x2_t va0 = vld1_dup_f32(a0); const float32x2_t va1 = vld1_dup_f32(a1); const float32x2_t va2 = vld1_dup_f32(a2); const float32x2_t va3 = vld1_dup_f32(a3); const float32x2_t vb01 = vld1_f32(w); w += 2; vacc0x01 = vmla_f32(vacc0x01, va0, vb01); vacc1x01 = vmla_f32(vacc1x01, va1, vb01); vacc2x01 = vmla_f32(vacc2x01, va2, vb01); vacc3x01 = vmla_f32(vacc3x01, va3, vb01); } p -= 4 * sizeof(void*); } while (p != 0); const float32x2_t vmax = vld1_dup_f32(&params->scalar.max); vacc0x01 = vmin_f32(vacc0x01, vmax); vacc1x01 = vmin_f32(vacc1x01, vmax); vacc2x01 = vmin_f32(vacc2x01, vmax); vacc3x01 = vmin_f32(vacc3x01, vmax); const float32x2_t vmin = vld1_dup_f32(&params->scalar.min); vacc0x01 = vmax_f32(vacc0x01, vmin); vacc1x01 = vmax_f32(vacc1x01, vmin); vacc2x01 = vmax_f32(vacc2x01, vmin); vacc3x01 = vmax_f32(vacc3x01, vmin); if XNN_LIKELY(nc >= 2) { vst1_f32(c3, vacc3x01); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1_f32(c2, vacc2x01); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1_f32(c1, vacc1x01); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1_f32(c0, vacc0x01); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 2; } else { assert(nc == 1); vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); nc = 0; } } while (nc != 0); } void
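/*
 * Indirect (IGEMM) variant of the 4x8 minmax GEMM: instead of strided rows,
 * the A operand arrives as an array of ks row pointers per output tile, and
 * any pointer equal to the shared `zero` buffer skips the a_offset
 * adjustment so padding taps read zeros. "ld128" refers to the 128-bit
 * vld1q_f32 loads of A in the main loop, which is unrolled over four k
 * steps; a scalar-broadcast loop consumes the k remainder one float at a
 * time.
 */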
xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(a0); a0 += 4; const float32x4_t va1 = vld1q_f32(a1); a1 += 4; const float32x4_t va2 = vld1q_f32(a2); a2 += 4; const float32x4_t va3 = vld1q_f32(a3); a3 += 4; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, 
vget_high_f32(va0), 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); } if XNN_UNLIKELY(k != 0) { do { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); k -= sizeof(float); } while (k != 0); } p -= 4 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123
= vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = 
vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); const float32x4_t va1 = vld1q_dup_f32(a1); const float32x4_t va2 = vld1q_dup_f32(a2); const float32x4_t va3 = vld1q_dup_f32(a3); const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } p -= 4 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); do { float* o = output; { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; const float* i8 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); i8 = (const float*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 4; c -= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vmax018 = vmaxq_f32(vmaxq_f32(vi0, vi1), vi8); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax01678 = vmaxq_f32(vmax018, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax01678); const float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); vst1q_f32(o, vout); o += 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vmax018 = vmaxq_f32(vmaxq_f32(vi0, vi1), vi8); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax01678 = vmaxq_f32(vmax018, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax01678); float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(o, vout_lo); o += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const
float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 4; c -= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vo = vld1q_f32(o); const float32x4_t vmax01 = vmaxq_f32(vmaxq_f32(vi0, vi1), vo); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax0167 = vmaxq_f32(vmax01, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax0167); const float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); vst1q_f32(o, vout); o += 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vo = vld1q_f32(o); const float32x4_t vmax01 = vmaxq_f32(vmaxq_f32(vi0, vi1), vo); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax0167 = vmaxq_f32(vmax01, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax0167); float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(o, vout_lo); o += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(o, vout_lo, 0); o += 1; } } } input = (const float**) ((uintptr_t) input + input_increment); output = (float*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* buffer, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements > 9); assert(channels != 0); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); do { { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++;
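/* As with i0 above, each remaining tap pointer is offset by input_offset only when it does not alias the shared "zero" buffer, so padding taps keep contributing zeros to the running sums accumulated in buffer. */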
assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = *input++; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); vst1q_f32(b, vsum); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = 
vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vacc = vld1q_f32(b); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); vst1q_f32(b, vsum); b += 4; } } { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; input = (const float**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float32x4_t vmultiplier = vld1q_dup_f32(multiplier); multiplier += 1; size_t c = channels; float* b = buffer; while (c >= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vacc = vld1q_f32(b); b += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); vst1q_f32(output, vout); output += 4; c -= 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vacc = vld1q_f32(b); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = 
vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(output, vout_lo); output += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(output, vout_lo, 0); output += 1; } } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(kernel_elements <= 9); assert(channels != 0); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); do { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; const float* i8 = input[8]; input = (const float**) ((uintptr_t) input + input_increment); if (kernel_elements < 2) { i1 = zero; } assert(i1 != NULL); if (kernel_elements <= 2) { i2 = zero; } assert(i2 != NULL); if (kernel_elements < 4) { i3 = zero; } assert(i3 != NULL); if (kernel_elements <= 4) { i4 = zero; } assert(i4 != NULL); if (kernel_elements < 6) { i5 = zero; } assert(i5 != NULL); if (kernel_elements <= 6) { i6 = zero; } assert(i6 != NULL); if (kernel_elements < 8) { i7 = zero; } assert(i7 != NULL); if (kernel_elements <= 8) { i8 = zero; } assert(i8 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float32x4_t vmultiplier = vld1q_dup_f32(multiplier); multiplier += 1; size_t c = channels; while (c >= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 =
vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); vst1q_f32(output, vout); output += 4; c -= 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vi8 = vld1q_f32(i8); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(output, vout_lo); output += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(output, vout_lo, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_prelu_ukernel__neon_2x8( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vw4567 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); 
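/*
 * PReLU select: the vcltq_s32 comparisons above treat the raw float bits as
 * signed integers, so lanes with the sign bit set (negative inputs) produce
 * an all-ones mask. vbslq_f32 then keeps the scaled product (weight * x) for
 * those lanes and passes the input through unchanged elsewhere, i.e.
 * f(x) = x >= 0 ? x : w * x.
 */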
vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o0, vacc0x4567); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; vst1q_f32(o1, vacc1x4567); o1 += 4; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); if (c & (2 * sizeof(float))) { vst1_f32(o0, vacc0x01); o0 += 2; vst1_f32(o1, vacc1x01); o1 += 2; vacc0x01 = vget_high_f32(vacc0x0123); vacc1x01 = vget_high_f32(vacc1x0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const int16x8_t vminus_kernel_zero_point = vld1q_dup_s16(&params->scalar.minus_kernel_zero_point[0]); const uint8x8_t vmask = vmov_n_u8(UINT8_C(0xF)); do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const uint8x8_t vw01234567c01 = vld1_u8(w); w = (const uint8_t*) w + 8; const uint8x8_t vw01234567c0 = vand_u8(vw01234567c01, vmask); const uint8x8_t vw01234567c1 = vshr_n_u8(vw01234567c01, 4); const int16x8_t vxw01234567c0 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c0)); const int16x8_t vxw01234567c1 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c1)); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const
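/*
 * QC4W weight layout, as unpacked above: each packed byte holds two
 * 4-bit quantized weights, column c0 in the low nibble and c1 in the
 * high nibble. vand_u8/vshr_n_u8 split them, and vaddw_s8 against the
 * pre-negated zero point recenters and widens in one step. Per byte b:
 *
 *   w_c0 = (int) (b & 0xF) - kernel_zero_point;
 *   w_c1 = (int) (b >> 4) - kernel_zero_point;
 */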
int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const uint8x8_t vw01234567 = vld1_u8(w); w = (const uint8_t*) w + 8; const int16x8_t vxw01234567 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567)); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01234567)); const int32x4_t vxw4567 = vmovl_s16(vget_high_s16(vxw01234567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const int16x8_t vminus_kernel_zero_point = vld1q_dup_s16(&params->scalar.minus_kernel_zero_point[0]); const uint8x8_t vmask = vmov_n_u8(UINT8_C(0xF)); do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t
vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const uint8x8_t vw01234567c01 = vld1_u8(w); w = (const uint8_t*) w + 8; const uint8x8_t vw01234567c0 = vand_u8(vw01234567c01, vmask); const uint8x8_t vw01234567c1 = vshr_n_u8(vw01234567c01, 4); const int16x8_t vxw01234567c0 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c0)); const int16x8_t vxw01234567c1 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c1)); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const uint8x8_t vw01234567 = vld1_u8(w); w = (const uint8_t*) w + 8; const int16x8_t vxw01234567 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567)); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01234567)); const int32x4_t vxw4567 = vmovl_s16(vget_high_s16(vxw01234567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); vacc2x0123 = vmulq_f32(vacc2x0123, vscale0123); vacc3x0123 = vmulq_f32(vacc3x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 
4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); vacc1x4567 = vmulq_f32(vacc1x4567, vscale4567); vacc2x4567 = vmulq_f32(vacc2x4567, vscale4567); vacc3x4567 = vmulq_f32(vacc3x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const int8x8_t vw01234567c0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vw01234567c1 = vld1_s8(w); w = (const int8_t*) w + 8; const int16x8_t vxw01234567c0 = vmovl_s8(vw01234567c0); const int16x8_t vxw01234567c1 = vmovl_s8(vw01234567c1); const int32x4_t vxw0123c0 =
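/*
 * The qc8w kernels widen the int8 weights in two vmovl steps
 * (s8 -> s16 -> s32) and convert to float once per loaded block; the
 * dot product itself then runs entirely in f32. Each loop iteration
 * consumes two k-steps per row: vmlaq_lane_f32 with lane 0 applies the
 * first loaded input element and lane 1 the second, so one 64-bit
 * vld1_f32 feeds two multiply-accumulates.
 */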
vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const int8x8_t vw01230123 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int8x8_t vw45674567 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int16x8_t vxw01230123 = vmovl_s8(vw01230123); const int16x8_t vxw45674567 = vmovl_s8(vw45674567); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01230123)); const int32x4_t vxw4567 = vmovl_s16(vget_low_s16(vxw45674567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc1x0123 = vacc0x0123;
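/*
 * Rows 1..3 start from copies of row 0's accumulators because the packed
 * weight stream stores a single shared bias vector per 8-column block.
 * Out-of-range rows were aliased onto valid ones above (a1 = a0, c1 = c0,
 * and so on), so this 4-row path handles any mr <= 4; stores through the
 * duplicated c pointers simply overwrite the same output row.
 */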
float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const int8x8_t vw01234567c0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vw01234567c1 = vld1_s8(w); w = (const int8_t*) w + 8; const int16x8_t vxw01234567c0 = vmovl_s8(vw01234567c0); const int16x8_t vxw01234567c1 = vmovl_s8(vw01234567c1); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const int8x8_t vw01230123 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int8x8_t vw45674567 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int16x8_t vxw01230123 = vmovl_s8(vw01230123); const int16x8_t vxw45674567 = vmovl_s8(vw45674567); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01230123)); const int32x4_t vxw4567 = vmovl_s16(vget_low_s16(vxw45674567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); vacc2x0123 = vmulq_f32(vacc2x0123, vscale0123); vacc3x0123 = vmulq_f32(vacc3x0123, vscale0123); const 
float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); vacc1x4567 = vmulq_f32(vacc1x4567, vscale4567); vacc2x4567 = vmulq_f32(vacc2x4567, vscale4567); vacc3x4567 = vmulq_f32(vacc3x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qs8_vcvt_ukernel__neon_x32( size_t batch, const float* input, int8_t* output, const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon.magic_bias); const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->neon.magic_bias_less_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vxGHIJ = vld1q_f32(input); input
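/*
 * Quantizing conversion used by this kernel (and the qu8 variant below):
 * after scaling, adding the "magic bias" aligns the value's integer part
 * with the low mantissa bits, so the float's bit pattern minus a
 * precomputed constant yields the rounded, zero-point-adjusted integer.
 * vqsubq_s32 keeps the subtraction saturating. A scalar sketch, with
 * fp32_bits() as a hypothetical bit-cast helper:
 *
 *   int32_t q = (int32_t) fp32_bits(x * scale + magic_bias)
 *             - magic_bias_less_zero_point;
 */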
+= 4; float32x4_t vxKLMN = vld1q_f32(input); input += 4; float32x4_t vxOPQR = vld1q_f32(input); input += 4; float32x4_t vxSTUV = vld1q_f32(input); input += 4; vx0123 = vmulq_f32(vx0123, vscale); vx4567 = vmulq_f32(vx4567, vscale); vx89AB = vmulq_f32(vx89AB, vscale); vxCDEF = vmulq_f32(vxCDEF, vscale); vxGHIJ = vmulq_f32(vxGHIJ, vscale); vxKLMN = vmulq_f32(vxKLMN, vscale); vxOPQR = vmulq_f32(vxOPQR, vscale); vxSTUV = vmulq_f32(vxSTUV, vscale); vx0123 = vaddq_f32(vx0123, vmagic_bias); vx4567 = vaddq_f32(vx4567, vmagic_bias); vx89AB = vaddq_f32(vx89AB, vmagic_bias); vxCDEF = vaddq_f32(vxCDEF, vmagic_bias); vxGHIJ = vaddq_f32(vxGHIJ, vmagic_bias); vxKLMN = vaddq_f32(vxKLMN, vmagic_bias); vxOPQR = vaddq_f32(vxOPQR, vmagic_bias); vxSTUV = vaddq_f32(vxSTUV, vmagic_bias); const int32x4_t vacc0123 = vqsubq_s32(vreinterpretq_s32_f32(vx0123), vmagic_bias_less_zero_point); const int32x4_t vacc4567 = vqsubq_s32(vreinterpretq_s32_f32(vx4567), vmagic_bias_less_zero_point); const int32x4_t vacc89AB = vqsubq_s32(vreinterpretq_s32_f32(vx89AB), vmagic_bias_less_zero_point); const int32x4_t vaccCDEF = vqsubq_s32(vreinterpretq_s32_f32(vxCDEF), vmagic_bias_less_zero_point); const int32x4_t vaccGHIJ = vqsubq_s32(vreinterpretq_s32_f32(vxGHIJ), vmagic_bias_less_zero_point); const int32x4_t vaccKLMN = vqsubq_s32(vreinterpretq_s32_f32(vxKLMN), vmagic_bias_less_zero_point); const int32x4_t vaccOPQR = vqsubq_s32(vreinterpretq_s32_f32(vxOPQR), vmagic_bias_less_zero_point); const int32x4_t vaccSTUV = vqsubq_s32(vreinterpretq_s32_f32(vxSTUV), vmagic_bias_less_zero_point); const int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); const int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); const int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); vy = vmax_s8(vy, vget_low_s8(voutput_min)); vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); float32x4_t vx_lo = vld1q_f32(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); float32x4_t vx_hi = vld1q_f32(x_hi); vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); 
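/*
 * Tail handling for 1..7 leftover floats: instead of a scalar loop, the
 * kernel points vx_hi at input offset by (batch & 16) bytes, so the two
 * 4-wide loads cover the remainder (reading past it is sanctioned by
 * XNN_OOB_READS), and the converted bytes are stored below in 4-, 2-,
 * and 1-lane pieces.
 */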
vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); vy = vmax_s8(vy, vget_low_s8(voutput_min)); vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(float))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(float))) { vst1_lane_s8(output, vy, 0); } } } void xnn_f32_qu8_vcvt_ukernel__neon_x32( size_t batch, const float* input, uint8_t* output, const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon.magic_bias); const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->neon.magic_bias_less_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vxGHIJ = vld1q_f32(input); input += 4; float32x4_t vxKLMN = vld1q_f32(input); input += 4; float32x4_t vxOPQR = vld1q_f32(input); input += 4; float32x4_t vxSTUV = vld1q_f32(input); input += 4; vx0123 = vmulq_f32(vx0123, vscale); vx4567 = vmulq_f32(vx4567, vscale); vx89AB = vmulq_f32(vx89AB, vscale); vxCDEF = vmulq_f32(vxCDEF, vscale); vxGHIJ = vmulq_f32(vxGHIJ, vscale); vxKLMN = vmulq_f32(vxKLMN, vscale); vxOPQR = vmulq_f32(vxOPQR, vscale); vxSTUV = vmulq_f32(vxSTUV, vscale); vx0123 = vaddq_f32(vx0123, vmagic_bias); vx4567 = vaddq_f32(vx4567, vmagic_bias); vx89AB = vaddq_f32(vx89AB, vmagic_bias); vxCDEF = vaddq_f32(vxCDEF, vmagic_bias); vxGHIJ = vaddq_f32(vxGHIJ, vmagic_bias); vxKLMN = vaddq_f32(vxKLMN, vmagic_bias); vxOPQR = vaddq_f32(vxOPQR, vmagic_bias); vxSTUV = vaddq_f32(vxSTUV, vmagic_bias); const int32x4_t vacc0123 = vqsubq_s32(vreinterpretq_s32_f32(vx0123), vmagic_bias_less_zero_point); const int32x4_t vacc4567 = vqsubq_s32(vreinterpretq_s32_f32(vx4567), vmagic_bias_less_zero_point); const int32x4_t vacc89AB = vqsubq_s32(vreinterpretq_s32_f32(vx89AB), vmagic_bias_less_zero_point); const int32x4_t vaccCDEF = vqsubq_s32(vreinterpretq_s32_f32(vxCDEF), vmagic_bias_less_zero_point); const int32x4_t vaccGHIJ = vqsubq_s32(vreinterpretq_s32_f32(vxGHIJ), vmagic_bias_less_zero_point); const int32x4_t vaccKLMN = vqsubq_s32(vreinterpretq_s32_f32(vxKLMN), vmagic_bias_less_zero_point); const int32x4_t vaccOPQR = vqsubq_s32(vreinterpretq_s32_f32(vxOPQR), vmagic_bias_less_zero_point); const int32x4_t vaccSTUV = vqsubq_s32(vreinterpretq_s32_f32(vxSTUV), vmagic_bias_less_zero_point); const int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); const int16x8_t vaccGHIJKLMN =
vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); const int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); vy = vmax_u8(vy, vget_low_u8(voutput_min)); vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); float32x4_t vx_lo = vld1q_f32(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); float32x4_t vx_hi = vld1q_f32(x_hi); vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); vy = vmax_u8(vy, vget_low_u8(voutput_min)); vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(float))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(float))) { vst1_lane_u8(output, vy, 0); } } } extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8( size_t batch, const float* input, const float* max, float* output, float* sum, const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(max != NULL); assert(output != NULL); assert(sum != NULL); const float32x4_t vi_max = vld1q_dup_f32(max); const float32x4_t vlog2e = vld1q_dup_f32(&params->neon_rr2_lut64_p2.log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut64_p2.magic_bias); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vminus_ln2_hi = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_ln2_hi); const float32x4_t vminus_ln2_lo = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_ln2_lo); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut64_p2.c2); const float32x4_t
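/*
 * rr2_lut64_p2 reduction used below: x is written as x = (n/64)*ln2 + t
 * with n = round(64*x/ln2), obtained via the magic-bias add. The low
 * 6 bits of n index a 64-entry table of 2^(j/64) while the remaining
 * bits form the exponent adjustment ve; t is reconstructed with a split
 * hi/lo ln2 ("rr2") to limit rounding error, and e^t is approximated by
 * the degree-2 polynomial 1 + t + c2*t^2 applied in fused form, so that
 * e^x = s * (1 + t + c2*t^2) with s = 2^(n/64). Inputs below
 * denorm_cutoff are flushed to zero with vbic.
 */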
vdenorm_cutoff = vld1q_dup_f32(&params->neon_rr2_lut64_p2.denorm_cutoff); float32x4_t vacc0 = vmovq_n_f32(0.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vi0123 = vld1q_f32(input); input += 4; const float32x4_t vi4567 = vld1q_f32(input); input += 4; const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; vacc0 = vaddq_f32(vacc0, vf0123); vacc0 = vaddq_f32(vacc0, vf4567); } float32x4_t vacc = vacc0; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const
uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlaq_f32(vt, vt, vp); float32x4_t vf = vmlaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); vst1q_f32(output, vf); output += 4; vacc = vaddq_f32(vacc, vf); } #if XNN_ARCH_ARM64 float vacc_lo = vaddvq_f32(vacc); #else float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); #endif if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 3 * sizeof(float)); const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlaq_f32(vt, vt, vp); float32x4_t vf = vmlaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; #if XNN_ARCH_ARM64 vacc_lo += vaddv_f32(vf_lo); #else vacc_lo = vadd_f32(vacc_lo, vf_lo); #endif vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); #if XNN_ARCH_ARM64 vacc_lo += vget_lane_f32(vf_lo, 0); #else vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); #endif } } #if XNN_ARCH_ARM64 *sum = vacc_lo; #else vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); #endif } void xnn_f32_rmax_ukernel__neon( size_t batch, const float* input, float* output) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vmax0 = vld1q_dup_f32(input); float32x4_t vmax1 = vmax0; float32x4_t vmax2 = vmax0; float32x4_t vmax3 = vmax0; for (; batch >= 64; batch -= 64) { const float32x4_t vx0 = vld1q_f32(input); input += 4; const float32x4_t vx1 = vld1q_f32(input); input += 4; const float32x4_t vx2 = vld1q_f32(input); input += 4; const float32x4_t vx3 = 
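/*
 * Like the other reductions in this file, rmax keeps four independent
 * accumulators so successive vmaxq_f32 ops do not serialize on a single
 * register; they are folded into one vector after the unrolled loop and
 * reduced across lanes at the end (vpmax, or vmaxv on AArch64).
 */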
vld1q_f32(input); input += 4; vmax0 = vmaxq_f32(vmax0, vx0); vmax1 = vmaxq_f32(vmax1, vx1); vmax2 = vmaxq_f32(vmax2, vx2); vmax3 = vmaxq_f32(vmax3, vx3); } float32x4_t vmax = vmaxq_f32(vmaxq_f32(vmax0, vmax1), vmaxq_f32(vmax2, vmax3)); for (; batch >= 16; batch -= 16) { const float32x4_t vx = vld1q_f32(input); input += 4; vmax = vmaxq_f32(vmax, vx); } #if XNN_ARCH_ARM64 float32x2_t vmax_lo = vget_low_f32(vpmaxq_f32(vmax, vmax)); #else float32x2_t vmax_lo = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); #endif if XNN_UNLIKELY(batch != 0) { do { const float32x2_t vx = vld1_dup_f32(input); input += 1; vmax_lo = vmax_f32(vmax_lo, vx); batch -= 4; } while (batch != 0); } #if XNN_ARCH_ARM64 *output = vmaxv_f32(vmax_lo); #else vst1_lane_f32(output, vpmax_f32(vmax_lo, vmax_lo), 0); #endif } void xnn_f32_rminmax_ukernel__neon_x16_acc4( size_t batch, const float* input, float* output, const union xnn_f32_default_params* params) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vmin0 = vld1q_dup_f32(input); float32x4_t vmax0 = vmin0; float32x4_t vmin1 = vmin0; float32x4_t vmax1 = vmax0; float32x4_t vmin2 = vmin0; float32x4_t vmax2 = vmax0; float32x4_t vmin3 = vmin0; float32x4_t vmax3 = vmax0; for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vt0 = vld1q_f32(input); input += 4; const float32x4_t vt1 = vld1q_f32(input); input += 4; const float32x4_t vt2 = vld1q_f32(input); input += 4; const float32x4_t vt3 = vld1q_f32(input); input += 4; vmin0 = vminq_f32(vmin0, vt0); vmax0 = vmaxq_f32(vmax0, vt0); vmin1 = vminq_f32(vmin1, vt1); vmax1 = vmaxq_f32(vmax1, vt1); vmin2 = vminq_f32(vmin2, vt2); vmax2 = vmaxq_f32(vmax2, vt2); vmin3 = vminq_f32(vmin3, vt3); vmax3 = vmaxq_f32(vmax3, vt3); } vmin0 = vminq_f32(vmin0, vmin1); vmax0 = vmaxq_f32(vmax0, vmax1); vmin2 = vminq_f32(vmin2, vmin3); vmax2 = vmaxq_f32(vmax2, vmax3); vmin0 = vminq_f32(vmin0, vmin2); vmax0 = vmaxq_f32(vmax0, vmax2); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vt = vld1q_f32(input); input += 4; vmin0 = vminq_f32(vmin0, vt); vmax0 = vmaxq_f32(vmax0, vt); } float32x2_t vmin = vmin_f32(vget_low_f32(vmin0), vget_high_f32(vmin0)); float32x2_t vmax = vmax_f32(vget_low_f32(vmax0), vget_high_f32(vmax0)); if XNN_UNLIKELY(batch & (2 * sizeof(float))) { const float32x2_t vt = vld1_f32(input); input += 2; vmin = vmin_f32(vmin, vt); vmax = vmax_f32(vmax, vt); } vmin = vpmin_f32(vmin, vmin); vmax = vpmax_f32(vmax, vmax); if XNN_UNLIKELY(batch & (1 * sizeof(float))) { const float32x2_t vt = vld1_dup_f32(input); vmin = vmin_f32(vmin, vt); vmax = vmax_f32(vmax, vt); } vst1_lane_f32(output, vmin, 0); vst1_lane_f32(output + 1, vmax, 0); } void xnn_f32_rsum_ukernel__neon_x16_acc4( size_t batch, const float* input, float* output, const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vacc0 = vmovq_n_f32(0.0f); float32x4_t vacc1 = vmovq_n_f32(0.0f); float32x4_t vacc2 = vmovq_n_f32(0.0f); float32x4_t vacc3 = vmovq_n_f32(0.0f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vt0 = vld1q_f32(input); input += 4; const float32x4_t vt1 = vld1q_f32(input); input += 4; const float32x4_t vt2 = vld1q_f32(input); input += 4; const float32x4_t vt3 = vld1q_f32(input); input += 4; vacc0 = vaddq_f32(vacc0, vt0); vacc1 = vaddq_f32(vacc1, vt1); vacc2 = vaddq_f32(vacc2, 
vt2); vacc3 = vaddq_f32(vacc3, vt3); } vacc0 = vaddq_f32(vacc0, vacc1); vacc2 = vaddq_f32(vacc2, vacc3); vacc0 = vaddq_f32(vacc0, vacc2); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vt = vld1q_f32(input); input += 4; vacc0 = vaddq_f32(vacc0, vt); } const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(float))) { const float32x2_t vt = vld1_f32(input); input += 2; vacc = vadd_f32(vacc, vt); } vacc = vpadd_f32(vacc, vacc); if XNN_UNLIKELY(batch & (1 * sizeof(float))) { const float32x2_t vt = vld1_dup_f32(input); vacc = vadd_f32(vacc, vt); } vacc = vmul_f32(vacc, vscale); vst1_lane_f32(output, vacc, 0); } void xnn_f32_spmm_minmax_ukernel_32x1__neon( size_t mc, size_t nc, const float* input, const float* weights, const int32_t* widx_dmap, const uint32_t* nidx_nnzmap, float* output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mc != 0); assert(mc % sizeof(float) == 0); assert(nc != 0); #if XNN_ARCH_ARM64 const float32x4x2_t vminmax = vld2q_dup_f32(&params->scalar.min); const float32x4_t vmin = vminmax.val[0]; const float32x4_t vmax = vminmax.val[1]; #else const float32x2x2_t vminmax = vld2_dup_f32(&params->scalar.min); const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]); const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]); #endif size_t output_decrement = output_stride * nc - 32 * sizeof(float); while XNN_LIKELY(mc >= 32 * sizeof(float)) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; float32x4_t vacc89AB = vacc0123; float32x4_t vaccCDEF = vacc0123; float32x4_t vaccGHIJ = vacc0123; float32x4_t vaccKLMN = vacc0123; float32x4_t vaccOPQR = vacc0123; float32x4_t vaccSTUV = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); const float32x4_t vi89AB = vld1q_f32(input + 8); const float32x4_t viCDEF = vld1q_f32(input + 12); const float32x4_t viGHIJ = vld1q_f32(input + 16); const float32x4_t viKLMN = vld1q_f32(input + 20); const float32x4_t viOPQR = vld1q_f32(input + 24); const float32x4_t viSTUV = vld1q_f32(input + 28); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x4_t vw = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); vacc89AB = vmlaq_f32(vacc89AB, vi89AB, vw); vaccCDEF = vmlaq_f32(vaccCDEF, viCDEF, vw); vaccGHIJ = vmlaq_f32(vaccGHIJ, viGHIJ, vw); vaccKLMN = vmlaq_f32(vaccKLMN, viKLMN, vw); vaccOPQR = vmlaq_f32(vaccOPQR, viOPQR, vw); vaccSTUV = vmlaq_f32(vaccSTUV, viSTUV, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax); float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax); float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax); float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB =
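/*
 * Sparse format consumed by this spmm kernel: for each output channel,
 * *nnzmap++ gives its nonzero count, and every nonzero weight is paired
 * with a signed byte delta from widx_dmap that advances the dense input
 * pointer to the next nonzero's row. In sketch form, per 32-float block:
 *
 *   for (nz = *nnzmap++; nz != 0; nz--) {
 *     acc += block_load(input) * (*w++);
 *     input = (const float*) ((uintptr_t) input + *dmap++);
 *   }
 */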
vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); voutGHIJ = vmaxq_f32(voutGHIJ, vmin); voutKLMN = vmaxq_f32(voutKLMN, vmin); voutOPQR = vmaxq_f32(voutOPQR, vmin); voutSTUV = vmaxq_f32(voutSTUV, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); vst1q_f32(output + 16, voutGHIJ); vst1q_f32(output + 20, voutKLMN); vst1q_f32(output + 24, voutOPQR); vst1q_f32(output + 28, voutSTUV); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 32; mc -= 32 * sizeof(float); } if XNN_UNLIKELY(mc != 0) { output_decrement += 16 * sizeof(float); if (mc & (16 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; float32x4_t vacc89AB = vacc0123; float32x4_t vaccCDEF = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); const float32x4_t vi89AB = vld1q_f32(input + 8); const float32x4_t viCDEF = vld1q_f32(input + 12); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); vacc89AB = vmlaq_f32(vacc89AB, vi89AB, vw); vaccCDEF = vmlaq_f32(vaccCDEF, viCDEF, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB = vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 16; } output_decrement += 8 * sizeof(float); if (mc & (8 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 8; } output_decrement += 4 * sizeof(float); if (mc & (4 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = 
vld1q_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vst1q_f32(output, vout0123); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 4; } output_decrement += 2 * sizeof(float); if (mc & (2 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc01 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi01 = vld1_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x2_t vw = vld1_dup_f32(w); w += 1; vacc01 = vmla_f32(vacc01, vi01, vw); } while (--nnz != 0); } float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); vout01 = vmax_f32(vout01, vget_low_f32(vmin)); vst1_f32(output, vout01); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 2; } output_decrement += 1 * sizeof(float); if (mc & (1 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc0 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi0 = vld1_dup_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x2_t vw = vld1_dup_f32(w); w += 1; vacc0 = vmla_f32(vacc0, vi0, vw); } while (--nnz != 0); } float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax)); vout0 = vmax_f32(vout0, vget_low_f32(vmin)); vst1_lane_f32(output, vout0, 0); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 1; } } } void xnn_f32_vadd_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vaddq_f32(va0, vb0); float32x4_t vacc1 = vaddq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc
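/*
 * The element-wise kernels from here on share one tail idiom: a leftover
 * of 1..3 floats is handled with a full 4-wide load (these ukernels are
 * built with XNN_OOB_READS, so the overread is sanctioned) followed by
 * piecewise 2-lane and 1-lane stores, keeping the main path branch-free.
 */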
= vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vaddc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vaddq_f32(vacc0, vb); vacc1 = vaddq_f32(vacc1, vb); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vmaxq_f32(va0, vb0); float32x4_t vacc1 = vmaxq_f32(va1, vb1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmaxq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vmaxq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 *
sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmaxc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vmaxq_f32(vacc_, vb); vaccl = vmaxq_f32(vaccl, vb); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vmaxq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vmaxq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmin_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vminq_f32(va0, vb0); float32x4_t vacc1 = vminq_f32(va1, vb1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vminq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vminq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vminc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vminq_f32(vacc_, vb); vaccl = vminq_f32(vaccl, vb); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vminq_f32(va, 
vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vminq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmul_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vmulq_f32(va0, vb0); float32x4_t vacc1 = vmulq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmulc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vmulq_f32(vacc_, vb); vaccl = vmulq_f32(vaccl, vb); vacc_ = vmaxq_f32(vacc_, voutput_min); vaccl = vmaxq_f32(vaccl, voutput_min); vacc_ = vminq_f32(vacc_, voutput_max); vaccl = vminq_f32(vaccl, voutput_max); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = 
vld1q_f32(input_a); float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vrsubc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vsubq_f32(vb, vacc_); vaccl = vsubq_f32(vb, vaccl); vacc_ = vmaxq_f32(vacc_, voutput_min); vaccl = vmaxq_f32(vaccl, voutput_min); vacc_ = vminq_f32(vacc_, voutput_max); vaccl = vminq_f32(vaccl, voutput_max); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(vb, va); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(vb, va); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsqrdiff_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vsubq_f32(va0, vb0); float32x4_t vacc1 = vsubq_f32(va1, vb1); vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { 
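// Note on the remainder pattern shared by the kernels in this file: the tail path loads a full 4-lane vector even when fewer than four floats remain; XNN_OOB_READS marks that read-only overrun of the input buffers as intentional, and only the valid lanes are written out via vst1_f32/vst1_lane_f32.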
vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsqrdiffc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vsubq_f32(vacc0, vb); vacc1 = vsubq_f32(vacc1, vb); vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsub_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vsubq_f32(va0, vb0); float32x4_t vacc1 = vsubq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsubc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); 
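// Like the other *c (scalar-broadcast) kernels in this file, vsubc reads a single float from input_b and splats it across a vector once, ahead of the loops, via vld1q_dup_f32.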
assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vsubq_f32(vacc0, vb); vacc1 = vsubq_f32(vacc1, vb); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vclamp_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vy_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t vy_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0123 = vld1q_f32(input); input += 4; float32x4_t vacc4567 = vld1q_f32(input); input += 4; vacc0123 = vmaxq_f32(vacc0123, vy_min); vacc4567 = vmaxq_f32(vacc4567, vy_min); vacc0123 = vminq_f32(vacc0123, vy_max); vacc4567 = vminq_f32(vacc4567, vy_max); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vacc = vld1q_f32(input); input += 4; vacc = vmaxq_f32(vacc, vy_min); vacc = vminq_f32(vacc, vy_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vacc = vld1q_f32(input); vacc = vmaxq_f32(vacc, vy_min); vacc = vminq_f32(vacc, vy_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vcmul_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float* ar = input_a; const float* ai = (const float*) ((uintptr_t) input_a + batch); const float* br = input_b; const float* bi = (const float*) ((uintptr_t) input_b + batch); float* or = output; float* oi = (float*) ((uintptr_t) output + batch); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0r = vld1q_f32(ar); ar += 4; const float32x4_t va0i = vld1q_f32(ai); ai += 4; const float32x4_t vb0r = vld1q_f32(br); br += 4; const float32x4_t vb0i 
= vld1q_f32(bi); bi += 4; const float32x4_t va1r = vld1q_f32(ar); ar += 4; const float32x4_t va1i = vld1q_f32(ai); ai += 4; const float32x4_t vb1r = vld1q_f32(br); br += 4; const float32x4_t vb1i = vld1q_f32(bi); bi += 4; float32x4_t vacc0r = vmulq_f32(va0r, vb0r); float32x4_t vacc0i = vmulq_f32(va0r, vb0i); float32x4_t vacc1r = vmulq_f32(va1r, vb1r); float32x4_t vacc1i = vmulq_f32(va1r, vb1i); vacc0r = vmlsq_f32(vacc0r, va0i, vb0i); vacc0i = vmlaq_f32(vacc0i, va0i, vb0r); vacc1r = vmlsq_f32(vacc1r, va1i, vb1i); vacc1i = vmlaq_f32(vacc1i, va1i, vb1r); vst1q_f32(or, vacc0r); or += 4; vst1q_f32(oi, vacc0i); oi += 4; vst1q_f32(or, vacc1r); or += 4; vst1q_f32(oi, vacc1i); oi += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t var = vld1q_f32(ar); ar += 4; const float32x4_t vai = vld1q_f32(ai); ai += 4; const float32x4_t vbr = vld1q_f32(br); br += 4; const float32x4_t vbi = vld1q_f32(bi); bi += 4; float32x4_t vaccr = vmulq_f32(var, vbr); float32x4_t vacci = vmulq_f32(var, vbi); vaccr = vmlsq_f32(vaccr, vai, vbi); vacci = vmlaq_f32(vacci, vai, vbr); vst1q_f32(or, vaccr); or += 4; vst1q_f32(oi, vacci); oi += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t var = vld1q_f32(ar); ar += 4; const float32x4_t vai = vld1q_f32(ai); ai += 4; const float32x4_t vbr = vld1q_f32(br); br += 4; const float32x4_t vbi = vld1q_f32(bi); bi += 4; float32x4_t vaccr = vmulq_f32(var, vbr); float32x4_t vacci = vmulq_f32(var, vbi); vaccr = vmlsq_f32(vaccr, vai, vbi); vacci = vmlaq_f32(vacci, vai, vbr); float32x2_t vaccr_lo = vget_low_f32(vaccr); float32x2_t vacci_lo = vget_low_f32(vacci); if (batch & (2 * sizeof(float))) { vst1_f32(or, vaccr_lo); or += 2; vst1_f32(oi, vacci_lo); oi += 2; vaccr_lo = vget_high_f32(vaccr); vacci_lo = vget_high_f32(vacci); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(or, vaccr_lo, 0); vst1_lane_f32(oi, vacci_lo, 0); } } } extern XNN_INTERNAL const int32_t xnn_table_exp2minus_k_over_16[16]; void xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vprescale = vld1q_dup_f32(&params->neon_rr2_lut16_p3.prescale); const float32x4_t valpha = vld1q_dup_f32(&params->neon_rr2_lut16_p3.alpha); const float32x4_t vbeta = vld1q_dup_f32(&params->neon_rr2_lut16_p3.beta); const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neon_rr2_lut16_p3.sat_cutoff); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut16_p3.magic_bias); const float32x4_t vlog2e = vld1q_dup_f32(&params->neon_rr2_lut16_p3.log2e); const int32x4_t vindex_mask = vmovq_n_s32(0xF); const float32x4_t vminus_ln2_hi = vld1q_dup_f32(&params->neon_rr2_lut16_p3.minus_ln2_hi); const float32x4_t vminus_ln2_lo = vld1q_dup_f32(&params->neon_rr2_lut16_p3.minus_ln2_lo); const float32x4_t vc3 = vld1q_dup_f32(&params->neon_rr2_lut16_p3.c3); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut16_p3.c2); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vlog2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, 
vlog2e); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 2)); const int32x4_t ven0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 19); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 2)); const int32x4_t ven4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 19); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); int32x2_t vl01 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01)); int32x2_t vl23 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23)); vl01 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)), vl01, 1); vl23 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)), vl23, 1); const int32x4_t vl0123 = vcombine_s32(vl01, vl23); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); int32x2_t vl45 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45)); int32x2_t vl67 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67)); vl45 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)), vl45, 1); vl67 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)), vl67, 1); const int32x4_t vl4567 = vcombine_s32(vl45, vl67); vn0123 = vsubq_f32(vn0123, vmagic_bias); float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vl0123, ven0123)); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vl4567, ven4567)); float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vminus_ln2_hi); float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vminus_ln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); float32x4_t vp0123 = vmlaq_f32(vc2, vc3, vt0123); float32x4_t vp4567 = vmlaq_f32(vc2, vc3, vt4567); vp0123 = vmulq_f32(vp0123, vt0123); vp4567 = vmulq_f32(vp4567, vt4567); vt0123 = vmulq_f32(vt0123, vs0123); vs0123 = vsubq_f32(vs0123, vone); vt4567 = vmulq_f32(vt4567, vs4567); vs4567 = vsubq_f32(vs4567, vone); vp0123 = vmlaq_f32(vt0123, vp0123, vt0123); vp4567 = vmlaq_f32(vt4567, vp4567, vt4567); const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); vx0123 = vmulq_f32(vx0123, vbeta); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vx4567 = vmulq_f32(vx4567, vbeta); const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = 
vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vmlaq_f32(vz, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vmlaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vmlaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vmlaq_f32(vz, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vmlaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vmlaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vhswish_ukernel__neon_x16( size_t batch, const float* input, float* output, const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth); const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three); const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six)); const int32x4_t vzero = vdupq_n_s32(0); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = 
vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vacc0123 = vaddq_f32(vx0123, vthree); vx0123 = vmulq_f32(vx0123, vsixth); float32x4_t vacc4567 = vaddq_f32(vx4567, vthree); vx4567 = vmulq_f32(vx4567, vsixth); float32x4_t vacc89AB = vaddq_f32(vx89AB, vthree); vx89AB = vmulq_f32(vx89AB, vsixth); float32x4_t vaccCDEF = vaddq_f32(vxCDEF, vthree); vxCDEF = vmulq_f32(vxCDEF, vsixth); vacc0123 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc0123), vzero)); vacc4567 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc4567), vzero)); vacc89AB = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc89AB), vzero)); vaccCDEF = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vaccCDEF), vzero)); vacc0123 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc0123), vsix)); vacc4567 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc4567), vsix)); vacc89AB = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc89AB), vsix)); vaccCDEF = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vaccCDEF), vsix)); vacc0123 = vmulq_f32(vacc0123, vx0123); vacc4567 = vmulq_f32(vacc4567, vx4567); vacc89AB = vmulq_f32(vacc89AB, vx89AB); vaccCDEF = vmulq_f32(vaccCDEF, vxCDEF); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; vst1q_f32(output, vacc89AB); output += 4; vst1q_f32(output, vaccCDEF); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vacc = vaddq_f32(vx, vthree); vx = vmulq_f32(vx, vsixth); vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero)); vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix)); vacc = vmulq_f32(vacc, vx); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); float32x4_t vacc = vaddq_f32(vx, vthree); vx = vmulq_f32(vx, vsixth); vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero)); vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix)); vacc = vmulq_f32(vacc, vx); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vlrelu_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vslope = vld1q_dup_f32(&params->scalar.slope); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vacc0123 = vmulq_f32(vx0123, vslope); const uint32x4_t vmask0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0)); float32x4_t vacc4567 = vmulq_f32(vx4567, vslope); const uint32x4_t vmask4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0)); vacc0123 = vbslq_f32(vmask0123, vacc0123, vx0123); vacc4567 = vbslq_f32(vmask4567, vacc4567, vx4567); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vacc = vmulq_f32(vx, vslope); const uint32x4_t vmask = 
vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0)); vacc = vbslq_f32(vmask, vacc, vx); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); float32x4_t vacc = vmulq_f32(vx, vslope); const uint32x4_t vmask = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0)); vacc = vbslq_f32(vmask, vacc, vx); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vscale0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); const float32x4_t vbias0123 = vld1q_f32(w); w += 4; vacc0x0123 = vaddq_f32(vacc0x0123, vbias0123); vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vscale0123 = vld1q_f32(w); float32x4_t vacc0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); const float32x4_t vbias0123 = vld1q_f32(w + 4); vacc0x0123 = vaddq_f32(vacc0x0123, vbias0123); vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); if (c & (2 * sizeof(float))) { vst1_f32(o0, vacc0x01); o0 += 2; vst1_f32(o1, vacc1x01); o1 += 2; vacc0x01 = vget_high_f32(vacc0x0123); vacc1x01 = vget_high_f32(vacc1x0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_vrndd_ukernel__neon_x8( size_t batch, const float* input, float* output, const union 
xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); const uint32x4_t vone = vreinterpretq_u32_f32(vmovq_n_f32(1.0f)); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vprerndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vprerndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx0123 = vbslq_f32(vrndmask0123, vprerndx0123, vx0123); const float32x4_t vrndx4567 = vbslq_f32(vrndmask4567, vprerndx4567, vx4567); const uint32x4_t vadjmask0123 = vcgtq_f32(vrndx0123, vx0123); const uint32x4_t vadjmask4567 = vcgtq_f32(vrndx4567, vx4567); const float32x4_t vadjrndx0123 = vreinterpretq_f32_u32(vandq_u32(vadjmask0123, vone)); const float32x4_t vadjrndx4567 = vreinterpretq_f32_u32(vandq_u32(vadjmask4567, vone)); const float32x4_t vy0123 = vsubq_f32(vrndx0123, vadjrndx0123); const float32x4_t vy4567 = vsubq_f32(vrndx4567, vadjrndx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); const uint32x4_t vadjmask = vcgtq_f32(vrndx, vx); const float32x4_t vadjrndx = vreinterpretq_f32_u32(vandq_u32(vadjmask, vone)); const float32x4_t vy = vsubq_f32(vrndx, vadjrndx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); const uint32x4_t vadjmask = vcgtq_f32(vrndx, vx); const float32x4_t vadjrndx = vreinterpretq_f32_u32(vandq_u32(vadjmask, vone)); const float32x4_t vy = vsubq_f32(vrndx, vadjrndx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndne_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vmagic_number = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t 
vx4567 = vld1q_f32(input); input += 4; const float32x4_t vabsx0123 = vabsq_f32(vx0123); uint32x4_t vrndmask0123 = vcaltq_f32(vmagic_number, vx0123); const float32x4_t vabsx4567 = vabsq_f32(vx4567); uint32x4_t vrndmask4567 = vcaltq_f32(vmagic_number, vx4567); float32x4_t vrndabsx0123 = vaddq_f32(vabsx0123, vmagic_number); float32x4_t vrndabsx4567 = vaddq_f32(vabsx4567, vmagic_number); vrndmask0123 = vorrq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vorrq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx0123 = vsubq_f32(vrndabsx0123, vmagic_number); vrndabsx4567 = vsubq_f32(vrndabsx4567, vmagic_number); const float32x4_t vy0123 = vbslq_f32(vrndmask0123, vx0123, vrndabsx0123); const float32x4_t vy4567 = vbslq_f32(vrndmask4567, vx4567, vrndabsx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vabsx = vabsq_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vmagic_number, vx); float32x4_t vrndabsx = vaddq_f32(vabsx, vmagic_number); vrndmask = vorrq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx = vsubq_f32(vrndabsx, vmagic_number); const float32x4_t vy = vbslq_f32(vrndmask, vx, vrndabsx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vabsx = vabsq_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vmagic_number, vx); float32x4_t vrndabsx = vaddq_f32(vabsx, vmagic_number); vrndmask = vorrq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx = vsubq_f32(vrndabsx, vmagic_number); const float32x4_t vy = vbslq_f32(vrndmask, vx, vrndabsx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndu_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vprerndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vprerndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx0123 = vbslq_f32(vrndmask0123, vprerndx0123, vx0123); const float32x4_t vrndx4567 = vbslq_f32(vrndmask4567, vprerndx4567, vx4567); uint32x4_t vadjmask0123 = vcgeq_f32(vrndx0123, vx0123); uint32x4_t vadjmask4567 = vcgeq_f32(vrndx4567, vx4567); const float32x4_t vadjrndx0123 = vaddq_f32(vrndx0123, vone); const float32x4_t vadjrndx4567 = vaddq_f32(vrndx4567, vone); vadjmask0123 = vorrq_u32(vadjmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vadjmask4567 = 
vorrq_u32(vadjmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy0123 = vbslq_f32(vadjmask0123, vrndx0123, vadjrndx0123); const float32x4_t vy4567 = vbslq_f32(vadjmask4567, vrndx4567, vadjrndx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); uint32x4_t vadjmask = vcgeq_f32(vrndx, vx); const float32x4_t vadjrndx = vaddq_f32(vrndx, vone); vadjmask = vorrq_u32(vadjmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vadjmask, vrndx, vadjrndx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = vcvtq_s32_f32(vx); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); uint32x4_t vadjmask = vcgeq_f32(vrndx, vx); const float32x4_t vadjrndx = vaddq_f32(vrndx, vone); vadjmask = vorrq_u32(vadjmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vadjmask, vrndx, vadjrndx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndz_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vrndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vrndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy0123 = vbslq_f32(vrndmask0123, vrndx0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vrndmask4567, vrndx4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vrndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vrndmask, vrndx, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = 
vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vrndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vrndmask, vrndx, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64]; void xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8( size_t batch, const float* input, float* output, const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut64_p2.magic_bias); const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_log2e); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vln2_hi = vld1q_dup_f32(&params->neon_rr2_lut64_p2.ln2_hi); const float32x4_t vln2_lo = vld1q_dup_f32(&params->neon_rr2_lut64_p2.ln2_lo); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut64_p2.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vdenorm_cutoff = vld1q_dup_f32(&params->neon_rr2_lut64_p2.denorm_cutoff); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vabsq_f32(vx0123); const float32x4_t vz4567 = vabsq_f32(vx4567); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e); const int32x4_t ve0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 17); const int32x4_t ve4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 17); // Use the low 6 bits of n, as an integer, as an index for table lookup of l := 2**frac(n). 
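// How the lookup works (a sketch of the reconstruction, derived from the surrounding code): the magic-bias addition above rounded n := -z * log2(e) to a multiple of 1/64 and parked that fixed-point value in the low mantissa bits of vn. The low 6 bits index the 64-entry table xnn_table_exp2minus_k_over_64 (values of 2**(-k/64)), while the bits above them, moved into the exponent field by the shift-by-17 (23 mantissa bits - 6 fractional bits), supply the integer part of the power of two. Adding ve to a table entry's bit pattern therefore reconstructs s ~= 2**n without evaluating exp, and the degree-2 polynomial below only has to approximate e**(-t) on the small residual t = z + n*ln(2), with |t| roughly bounded by ln(2)/128.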
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx23]); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx67]); vl01 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi); float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); vp0123 = vmlsq_f32(vt0123, vp0123, vt0123); vp4567 = vmlsq_f32(vt4567, vp4567, vt4567); const float32x4_t vy0123 = vmlsq_f32(vs0123, vs0123, vp0123); const float32x4_t vy4567 = vmlsq_f32(vs4567, vs4567, vp4567); const float32x4_t vd0123 = vaddq_f32(vy0123, vone); const float32x4_t vd4567 = vaddq_f32(vy4567, vone); float32x4_t vr0123 = vrecpeq_f32(vd0123); float32x4_t vr4567 = vrecpeq_f32(vd4567); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); float32x4_t vf0123 = vmulq_f32(vy0123, vr0123); float32x4_t vf4567 = vmulq_f32(vy4567, vr4567); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff))); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123)); vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567)); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 
1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi); vt = vmlaq_f32(vt, vn, vln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlsq_f32(vt, vp, vt); const float32x4_t vy = vmlsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); vst1q_f32(output, vf); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi); vt = vmlaq_f32(vt, vn, vln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlsq_f32(vt, vp, vt); const float32x4_t vy = vmlsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); } } } void xnn_f32_vtanh_ukernel__neon_expm1minus_rr1_p6h5ts_nr2recps_x8( size_t batch, const float* input, float* output, const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.sat_cutoff); const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.minus_log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.magic_bias); const float32x4_t vln2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.ln2); const float32x4_t vc6 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c6); const float32x4_t 
vc5 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c5); const float32x4_t vc4 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c4); const float32x4_t vc3 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c3); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vtwo = vmovq_n_f32(2.0f); const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vz0123 = vabsq_f32(vx0123); float32x4_t vz4567 = vabsq_f32(vx4567); vz0123 = vminq_f32(vz0123, vsat_cutoff); vz4567 = vminq_f32(vz4567, vsat_cutoff); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e); const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); const float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2); const float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2); float32x4_t vp0123 = vmlaq_f32(vc5, vc6, vt0123); float32x4_t vp4567 = vmlaq_f32(vc5, vc6, vt4567); vp0123 = vmlaq_f32(vc4, vp0123, vt0123); vp0123 = vmlaq_f32(vc3, vp0123, vt0123); vp0123 = vmlaq_f32(vc2, vp0123, vt0123); vp4567 = vmlaq_f32(vc4, vp4567, vt4567); vp4567 = vmlaq_f32(vc3, vp4567, vt4567); vp4567 = vmlaq_f32(vc2, vp4567, vt4567); vp0123 = vmlsq_f32(vtwo, vp0123, vt0123); vp4567 = vmlsq_f32(vtwo, vp4567, vt4567); const float32x4_t vts0123 = vmulq_f32(vt0123, vs0123); const float32x4_t vsmo0123 = vsubq_f32(vs0123, vone); const float32x4_t vts4567 = vmulq_f32(vt4567, vs4567); const float32x4_t vsmo4567 = vsubq_f32(vs4567, vone); const float32x4_t vemo0123 = vmlsq_f32(vsmo0123, vp0123, vts0123); const float32x4_t vemo4567 = vmlsq_f32(vsmo4567, vp4567, vts4567); const float32x4_t vepo0123 = vaddq_f32(vemo0123, vtwo); const float32x4_t vepo4567 = vaddq_f32(vemo4567, vtwo); float32x4_t vrepo0123 = vrecpeq_f32(vepo0123); float32x4_t vrepo4567 = vrecpeq_f32(vepo4567); float32x4_t verepo0123 = vrecpsq_f32(vrepo0123, vepo0123); float32x4_t verepo4567 = vrecpsq_f32(vrepo4567, vepo4567); vrepo0123 = vmulq_f32(vrepo0123, verepo0123); vrepo4567 = vmulq_f32(vrepo4567, verepo4567); verepo0123 = vrecpsq_f32(vrepo0123, vepo0123); verepo4567 = vrecpsq_f32(vrepo4567, vepo4567); vrepo0123 = vmulq_f32(vrepo0123, verepo0123); vrepo4567 = vmulq_f32(vrepo4567, verepo4567); float32x4_t vy0123 = vmulq_f32(vemo0123, vrepo0123); float32x4_t vy4567 = vmulq_f32(vemo4567, vrepo4567); vy0123 = vbslq_f32(vsign_mask, vx0123, vy0123); vy4567 = vbslq_f32(vsign_mask, vx4567, vy4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vmlaq_f32(vz, vn, vln2); float32x4_t vp = vmlaq_f32(vc5, vc6, vt); vp = vmlaq_f32(vc4, vp, vt); vp = vmlaq_f32(vc3, vp, vt); vp = vmlaq_f32(vc2, vp, vt); vp = vmlsq_f32(vtwo, vp, vt); const 
float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vmlsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vmlaq_f32(vz, vn, vln2); float32x4_t vp = vmlaq_f32(vc5, vc6, vt); vp = vmlaq_f32(vc4, vp, vt); vp = vmlaq_f32(vc3, vp, vt); vp = vmlaq_f32(vc2, vp, vt); vp = vmlsq_f32(vtwo, vp, vt); const float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vmlsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); float32x2_t vy_low = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_low); output += 2; vy_low = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_low, 0); } } } void xnn_f32_vabs_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vabsq_f32(vx0123); const float32x4_t vy4567 = vabsq_f32(vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vy = vabsq_f32(vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vabsq_f32(vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vneg_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vnegq_f32(vx0123); const float32x4_t vy4567 = vnegq_f32(vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = 
vld1q_f32(input); input += 4; const float32x4_t vy = vnegq_f32(vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vnegq_f32(vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vsqr_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vmulq_f32(vx0123, vx0123); const float32x4_t vy4567 = vmulq_f32(vx4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vy = vmulq_f32(vx, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vmulq_f32(vx, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; do { const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksum89AB = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksumCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vzp0 = vld1q_dup_s32(&quantization_params[0].zero_point); int32x4_t vacc0x0123 = vmulq_s32(vksum0123, vzp0); int32x4_t vacc0x4567 = vmulq_s32(vksum4567, vzp0); int32x4_t vacc0x89AB = vmulq_s32(vksum89AB, vzp0); int32x4_t vacc0xCDEF = vmulq_s32(vksumCDEF, vzp0); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const 
int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t 
vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(int8_t)) { const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); } } } } } } } float32x4_t vout0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vout0x89AB = vcvtq_f32_s32(vacc0x89AB); float32x4_t vout0xCDEF = vcvtq_f32_s32(vacc0xCDEF); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscale4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscale89AB = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscaleCDEF = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias89AB = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbiasCDEF = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vinput_scale0 = vld1q_dup_f32(&quantization_params[0].inv_scale); const float32x4_t vscale0x0123 = vmulq_f32(vbscale0123, vinput_scale0); const float32x4_t vscale0x4567 = vmulq_f32(vbscale4567, vinput_scale0); const float32x4_t vscale0x89AB = 
vmulq_f32(vbscale89AB, vinput_scale0); const float32x4_t vscale0xCDEF = vmulq_f32(vbscaleCDEF, vinput_scale0);
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123); vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567); vout0x89AB = vfmaq_f32(vbias89AB, vout0x89AB, vscale0x89AB); vout0xCDEF = vfmaq_f32(vbiasCDEF, vout0xCDEF, vscale0xCDEF);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123); vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567); vout0x89AB = vmlaq_f32(vbias89AB, vout0x89AB, vscale0x89AB); vout0xCDEF = vmlaq_f32(vbiasCDEF, vout0xCDEF, vscale0xCDEF);
#endif
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vmaxq_f32(vout0x0123, vmin); vout0x4567 = vmaxq_f32(vout0x4567, vmin); vout0x89AB = vmaxq_f32(vout0x89AB, vmin); vout0xCDEF = vmaxq_f32(vout0xCDEF, vmin); vout0x0123 = vminq_f32(vout0x0123, vmax); vout0x4567 = vminq_f32(vout0x4567, vmax); vout0x89AB = vminq_f32(vout0x89AB, vmax); vout0xCDEF = vminq_f32(vout0xCDEF, vmax); if XNN_LIKELY(nc >= 16) { vst1q_f32(&c0[0], vout0x0123); vst1q_f32(&c0[4], vout0x4567); vst1q_f32(&c0[8], vout0x89AB); vst1q_f32(&c0[12], vout0xCDEF); a0 = (const int8_t*) ((uintptr_t) a0 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); nc -= 16; } else { if (nc & 8) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x89AB; vst1q_f32(c0, vout0x4567); c0 += 4; vout0x4567 = vout0xCDEF; } if (nc & 4) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c0, vout0x01); c0 += 2; vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { const int32x4_t vizp0 = vld1q_dup_s32(&quantization_params[0].zero_point); const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x0123 = vmulq_s32(vksum0123, vizp0); const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vmulq_s32(vksum4567, vizp0); size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0,
vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } float32x4_t vout0x0123 = 
vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vinput_scale0 = vld1q_dup_f32(&quantization_params[0].inv_scale); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x0123 = vmulq_f32(vinput_scale0, vbscale0123); const float32x4_t vbscale4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x4567 = vmulq_f32(vinput_scale0, vbscale4567); const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123);
#endif
const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567);
#else
vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567);
#endif
const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); vout0x0123 = vmaxq_f32(vout0x0123, voutput_min); vout0x4567 = vmaxq_f32(vout0x4567, voutput_min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vminq_f32(vout0x0123, voutput_max); vout0x4567 = vminq_f32(vout0x4567, voutput_max); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vout0x0123); vst1q_f32(c0 + 4, vout0x4567); a0 = (const int8_t*) ((uintptr_t) a0 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c0, vout0x01); c0 += 2; vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { a1 = a0; c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { const int32x4_t vizp01 = vld1q_s32(&quantization_params[0].zero_point); const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x0123 = vmulq_lane_s32(vksum0123, vget_low_s32(vizp01), 0); int32x4_t vacc1x0123 = vmulq_lane_s32(vksum0123, vget_high_s32(vizp01), 0); const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vmulq_lane_s32(vksum4567, vget_low_s32(vizp01), 0); int32x4_t vacc1x4567 = vmulq_lane_s32(vksum4567, vget_high_s32(vizp01), 0); size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t
vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t 
vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); } float32x4_t vout0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vout1x0123 = vcvtq_f32_s32(vacc1x0123); float32x4_t vout1x4567 = vcvtq_f32_s32(vacc1x4567); const float32x4_t vinput_scale01 = vreinterpretq_f32_s32(vld1q_s32(&quantization_params[0].zero_point)); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x0123 = vmulq_lane_f32(vbscale0123, vget_low_f32(vinput_scale01), 1); const float32x4_t vscale1x0123 = vmulq_lane_f32(vbscale0123, vget_high_f32(vinput_scale01), 1); const float32x4_t vbscale4567 = vld1q_f32(w); w = 
(const float*) w + 4; const float32x4_t vscale0x4567 = vmulq_lane_f32(vbscale4567, vget_low_f32(vinput_scale01), 1); const float32x4_t vscale1x4567 = vmulq_lane_f32(vbscale4567, vget_high_f32(vinput_scale01), 1); const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123); vout1x0123 = vfmaq_f32(vbias0123, vout1x0123, vscale1x0123);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123); vout1x0123 = vmlaq_f32(vbias0123, vout1x0123, vscale1x0123);
#endif
const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567); vout1x4567 = vfmaq_f32(vbias4567, vout1x4567, vscale1x4567);
#else
vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567); vout1x4567 = vmlaq_f32(vbias4567, vout1x4567, vscale1x4567);
#endif
const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); vout0x0123 = vmaxq_f32(vout0x0123, voutput_min); vout0x4567 = vmaxq_f32(vout0x4567, voutput_min); vout1x0123 = vmaxq_f32(vout1x0123, voutput_min); vout1x4567 = vmaxq_f32(vout1x4567, voutput_min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vminq_f32(vout0x0123, voutput_max); vout0x4567 = vminq_f32(vout0x4567, voutput_max); vout1x0123 = vminq_f32(vout1x0123, voutput_max); vout1x4567 = vminq_f32(vout1x4567, voutput_max); if XNN_LIKELY(nc >= 8) { vst1q_f32(c1, vout1x0123); vst1q_f32(c1 + 4, vout1x4567); vst1q_f32(c0, vout0x0123); vst1q_f32(c0 + 4, vout0x4567); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); c1 = (float*) ((uintptr_t) c1 + cn_stride); nc -= 8; } else { if (nc & 4) { vst1q_f32(c1, vout1x0123); c1 += 4; vout1x0123 = vout1x4567; vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout1x01 = vget_low_f32(vout1x0123); float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c1, vout1x01); c1 += 2; vst1_f32(c0, vout0x01); c0 += 2; vout1x01 = vget_high_f32(vout1x0123); vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c1, vout1x01, 0); vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qs16_qs8_vcvt_ukernel__neon_x32( size_t batch, const int16_t* input, int8_t* output, const union xnn_qs16_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int16_t) == 0); assert(input != NULL); assert(output != NULL); const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int16_t); batch -= 32 * sizeof(int16_t)) { const int16x8_t vx0 = vld1q_s16(input); input += 8; const int16x8_t vx1 = vld1q_s16(input); input += 8; const int16x8_t vx2 = vld1q_s16(input); input += 8; const int16x8_t vx3 = vld1q_s16(input); input += 8; int32x4_t vacc_lo0 = vshll_n_s16(vget_low_s16(vx0), 15); int32x4_t vacc_hi0 = vshll_n_s16(vget_high_s16(vx0), 15); int32x4_t vacc_lo1 = vshll_n_s16(vget_low_s16(vx1), 15); int32x4_t vacc_hi1 = vshll_n_s16(vget_high_s16(vx1), 15); int32x4_t vacc_lo2 = vshll_n_s16(vget_low_s16(vx2), 15); int32x4_t vacc_hi2 = vshll_n_s16(vget_high_s16(vx2), 15); int32x4_t vacc_lo3 = vshll_n_s16(vget_low_s16(vx3), 15); int32x4_t vacc_hi3 = vshll_n_s16(vget_high_s16(vx3), 15); vacc_lo0 = vqrdmulhq_s32(vacc_lo0, vmultiplier);
vacc_hi0 = vqrdmulhq_s32(vacc_hi0, vmultiplier); vacc_lo1 = vqrdmulhq_s32(vacc_lo1, vmultiplier); vacc_hi1 = vqrdmulhq_s32(vacc_hi1, vmultiplier); vacc_lo2 = vqrdmulhq_s32(vacc_lo2, vmultiplier); vacc_hi2 = vqrdmulhq_s32(vacc_hi2, vmultiplier); vacc_lo3 = vqrdmulhq_s32(vacc_lo3, vmultiplier); vacc_hi3 = vqrdmulhq_s32(vacc_hi3, vmultiplier); int16x8_t vacc0 = vcombine_s16(vqmovn_s32(vacc_lo0), vqmovn_s32(vacc_hi0)); int16x8_t vacc1 = vcombine_s16(vqmovn_s32(vacc_lo1), vqmovn_s32(vacc_hi1)); int16x8_t vacc2 = vcombine_s16(vqmovn_s32(vacc_lo2), vqmovn_s32(vacc_hi2)); int16x8_t vacc3 = vcombine_s16(vqmovn_s32(vacc_lo3), vqmovn_s32(vacc_hi3)); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x8_t vy0 = vqmovn_s16(vacc0); const int8x8_t vy1 = vqmovn_s16(vacc1); const int8x8_t vy2 = vqmovn_s16(vacc2); const int8x8_t vy3 = vqmovn_s16(vacc3); vst1_s8(output, vy0); output += 8; vst1_s8(output, vy1); output += 8; vst1_s8(output, vy2); output += 8; vst1_s8(output, vy3); output += 8; } for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { const int16x8_t vx = vld1q_s16(input); input += 8; int32x4_t vacc_lo = vshll_n_s16(vget_low_s16(vx), 15); int32x4_t vacc_hi = vshll_n_s16(vget_high_s16(vx), 15); vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier); vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier); int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int16_t)); assert(batch <= 7 * sizeof(int16_t)); const int16x8_t vx = vld1q_s16(input); int32x4_t vacc_lo = vshll_n_s16(vget_low_s16(vx), 15); int32x4_t vacc_hi = vshll_n_s16(vget_high_s16(vx), 15); vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier); vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier); int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int16_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int16_t))) { vst1_lane_s8((void*) output, vy, 0); } } } void xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1];
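/* Set up the 25 input-row pointers: rows that alias the zero buffer are kept as-is, all others are rebased by input_offset. */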
assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { 
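/* Main channel loop: 16 channels per iteration. The packed weights hold 16 int32 biases followed by 25 taps of 16 int8 weights each. */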
int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); 
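/* Taps are consumed in pairs: vmull_s8 opens a 16-bit product, vmlal_s8 folds in the next tap, and vaddw_s16 widens the pair into the 32-bit accumulators. */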
const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi9x89ABCDEF = vld1_s8(i9); i9 += 8; const int8x8_t vk9x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi9x89ABCDEF, vk9x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi10x89ABCDEF = vld1_s8(i10); i10 += 8; const int8x8_t vk10x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); vprod89ABCDEF = vmull_s8(vi10x89ABCDEF, vk10x89ABCDEF); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi11x89ABCDEF = vld1_s8(i11); i11 += 8; const int8x8_t vk11x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi11x89ABCDEF, vk11x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi12x89ABCDEF = vld1_s8(i12); i12 += 8; const int8x8_t vk12x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); vprod89ABCDEF = vmull_s8(vi12x89ABCDEF, vk12x89ABCDEF); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi13x89ABCDEF = vld1_s8(i13); i13 += 8; const int8x8_t vk13x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi13x89ABCDEF, vk13x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = 
vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi14x89ABCDEF = vld1_s8(i14); i14 += 8; const int8x8_t vk14x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); vprod89ABCDEF = vmull_s8(vi14x89ABCDEF, vk14x89ABCDEF); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi15x89ABCDEF = vld1_s8(i15); i15 += 8; const int8x8_t vk15x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi15x89ABCDEF, vk15x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi16x89ABCDEF = vld1_s8(i16); i16 += 8; const int8x8_t vk16x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); vprod89ABCDEF = vmull_s8(vi16x89ABCDEF, vk16x89ABCDEF); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi17x89ABCDEF = vld1_s8(i17); i17 += 8; const int8x8_t vk17x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi17x89ABCDEF, vk17x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi18x89ABCDEF = vld1_s8(i18); i18 += 8; const int8x8_t vk18x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); vprod89ABCDEF = vmull_s8(vi18x89ABCDEF, vk18x89ABCDEF); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi19x89ABCDEF = vld1_s8(i19); i19 += 8; const int8x8_t vk19x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi19x89ABCDEF, vk19x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi20x89ABCDEF = vld1_s8(i20); i20 += 8; const int8x8_t vk20x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); vprod89ABCDEF = vmull_s8(vi20x89ABCDEF, vk20x89ABCDEF); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t 
vi21x89ABCDEF = vld1_s8(i21); i21 += 8; const int8x8_t vk21x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi21x89ABCDEF, vk21x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi22x89ABCDEF = vld1_s8(i22); i22 += 8; const int8x8_t vk22x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); vprod89ABCDEF = vmull_s8(vi22x89ABCDEF, vk22x89ABCDEF); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi23x89ABCDEF = vld1_s8(i23); i23 += 8; const int8x8_t vk23x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi23x89ABCDEF, vk23x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi24x89ABCDEF = vld1_s8(i24); i24 += 8; const int8x8_t vk24x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vprod89ABCDEF = vmull_s8(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif // !XNN_ARCH_ARM64
vout0123456789ABCDEF =
vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8((const void*) (k + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8((const void*) (k + 152)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8((const void*) (k + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8((const void*) (k + 184)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8((const void*) (k + 200)); 
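/* Remaining taps 13-24 repeat the same vmull_s8/vmlal_s8 pairing used for the earlier taps, widening each pair of 8-bit products into the 32-bit accumulators via vaddw_s16. */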
vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8((const void*) (k + 216)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8((const void*) (k + 232)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8((const void*) (k + 248)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8((const void*) (k + 264)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8((const void*) (k + 280)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8((const void*) (k + 296)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8((const void*) (k + 312)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8((const void*) (k + 328)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8((const void*) (k + 344)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8((const void*) (k + 360)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8((const void*) (k + 376)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); 
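/* Scalar sketch of the rndnu requantization performed above. This assumes the usual convention that right_pre_shift and right_post_shift hold negative shift amounts, so the vqshlq_s32/vrshlq_s32 calls effectively shift right:
 *   acc = saturating_shift(acc, right_pre_shift);              // vqshlq_s32
 *   acc = (int32_t) (((int64_t) acc * multiplier * 2) >> 32);  // vqdmulhq_s32 (truncating high half)
 *   acc = rounding_shift(acc, right_post_shift);               // vrshlq_s32
 *   out = clamp(sat_s8(sat_s16(acc) + output_zero_point), output_min, output_max);
 */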
vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { vst1_s8(output, vout01234567); output += 8; c -= 8; } else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
} const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = 
vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = 
(const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); #endif // !XNN_ARCH_ARM64 vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vk0x01234567 = vld1_s8(w); int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vk1x01234567 = vld1_s8((const void*) ((const int8_t*) w + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); const int8x8_t vk2x01234567 = vld1_s8((const void*) ((const int8_t*) w + 16)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); const int8x8_t vk3x01234567 = vld1_s8((const void*) ((const int8_t*) w + 24)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); const int8x8_t vk4x01234567 = vld1_s8((const void*) ((const int8_t*) w + 32)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); const int8x8_t vk5x01234567 = vld1_s8((const void*) ((const int8_t*) w + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = 
vld1_s8(i6); const int8x8_t vk6x01234567 = vld1_s8((const void*) ((const int8_t*) w + 48)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); const int8x8_t vk7x01234567 = vld1_s8((const void*) ((const int8_t*) w + 56)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); const int8x8_t vk8x01234567 = vld1_s8((const void*) ((const int8_t*) w + 64)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); const int8x8_t vk9x01234567 = vld1_s8((const void*) ((const int8_t*) w + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); const int8x8_t vk10x01234567 = vld1_s8((const void*) ((const int8_t*) w + 80)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); const int8x8_t vk11x01234567 = vld1_s8((const void*) ((const int8_t*) w + 88)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); const int8x8_t vk12x01234567 = vld1_s8((const void*) ((const int8_t*) w + 96)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); const int8x8_t vk13x01234567 = vld1_s8((const void*) ((const int8_t*) w + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); const int8x8_t vk14x01234567 = vld1_s8((const void*) ((const int8_t*) w + 112)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); const int8x8_t vk15x01234567 = vld1_s8((const void*) ((const int8_t*) w + 120)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); const int8x8_t vk16x01234567 = vld1_s8((const void*) ((const int8_t*) w + 128)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); const int8x8_t vk17x01234567 = vld1_s8((const void*) ((const int8_t*) w + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); const int8x8_t vk18x01234567 = vld1_s8((const void*) ((const int8_t*) w + 144)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); const int8x8_t vk19x01234567 = vld1_s8((const void*) ((const int8_t*) w + 152)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); const int8x8_t vk20x01234567 = vld1_s8((const void*) 
((const int8_t*) w + 160)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); const int8x8_t vk21x01234567 = vld1_s8((const void*) ((const int8_t*) w + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); const int8x8_t vk22x01234567 = vld1_s8((const void*) ((const int8_t*) w + 176)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); const int8x8_t vk23x01234567 = vld1_s8((const void*) ((const int8_t*) w + 184)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); const int8x8_t vk24x01234567 = vld1_s8((const void*) ((const int8_t*) w + 192)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 =
input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) 
w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); 
vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { 
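/* A full 8-channel group can be stored in one shot; narrower tails fall through to the 4/2/1-lane stores below. */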
vst1_s8(output, vout01234567); output += 8; c -= 8; } else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_f32_vcvt_ukernel__neon_x32( size_t batch, const int8_t* input, float* output, const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vminus_zero_point = vreinterpretq_s16_u32(vld1q_dup_u32((const void*) params->neon.minus_zero_point)); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t vx01234567 = vld1_s8(input); input += 8; const int8x8_t vx89ABCDEF = vld1_s8(input); input += 8; const int8x8_t vxGHIJKLMN = vld1_s8(input); input += 8; const int8x8_t vxOPQRSTUV = vld1_s8(input); input += 8; const int16x8_t vhx01234567 = vaddw_s8(vminus_zero_point, vx01234567); const int16x8_t vhx89ABCDEF = vaddw_s8(vminus_zero_point, vx89ABCDEF); const int16x8_t vhxGHIJKLMN = vaddw_s8(vminus_zero_point, vxGHIJKLMN); const int16x8_t vhxOPQRSTUV = vaddw_s8(vminus_zero_point, vxOPQRSTUV); const int32x4_t vwx0123 = vmovl_s16(vget_low_s16(vhx01234567)); const int32x4_t vwx4567 = vmovl_s16(vget_high_s16(vhx01234567)); const int32x4_t vwx89AB = vmovl_s16(vget_low_s16(vhx89ABCDEF)); const int32x4_t vwxCDEF = vmovl_s16(vget_high_s16(vhx89ABCDEF)); const int32x4_t vwxGHIJ = vmovl_s16(vget_low_s16(vhxGHIJKLMN)); const int32x4_t vwxKLMN = vmovl_s16(vget_high_s16(vhxGHIJKLMN)); const int32x4_t vwxOPQR = vmovl_s16(vget_low_s16(vhxOPQRSTUV)); const int32x4_t vwxSTUV = vmovl_s16(vget_high_s16(vhxOPQRSTUV)); float32x4_t vy0123 = vcvtq_f32_s32(vwx0123); float32x4_t vy4567 = vcvtq_f32_s32(vwx4567); float32x4_t vy89AB = vcvtq_f32_s32(vwx89AB); float32x4_t vyCDEF = vcvtq_f32_s32(vwxCDEF); float32x4_t vyGHIJ = vcvtq_f32_s32(vwxGHIJ); float32x4_t vyKLMN = vcvtq_f32_s32(vwxKLMN); float32x4_t vyOPQR = vcvtq_f32_s32(vwxOPQR); float32x4_t vySTUV = vcvtq_f32_s32(vwxSTUV); vy0123 = vmulq_f32(vy0123, vscale); vy4567 = vmulq_f32(vy4567, vscale); vy89AB = vmulq_f32(vy89AB, vscale); vyCDEF = vmulq_f32(vyCDEF, vscale); vyGHIJ = vmulq_f32(vyGHIJ, vscale); vyKLMN = vmulq_f32(vyKLMN, vscale); vyOPQR = vmulq_f32(vyOPQR, vscale); vySTUV = vmulq_f32(vySTUV, vscale); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; vst1q_f32(output, vy89AB); output += 4; vst1q_f32(output, vyCDEF); output += 4; vst1q_f32(output, vyGHIJ); output += 4; vst1q_f32(output, vyKLMN); output += 4; vst1q_f32(output, vyOPQR); output += 4; vst1q_f32(output, vySTUV); output += 4; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; const int16x8_t vhx = vaddw_s8(vminus_zero_point, vx); const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx)); const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx)); float32x4_t vy_lo = vcvtq_f32_s32(vwx_lo); float32x4_t vy_hi = vcvtq_f32_s32(vwx_hi); vy_lo = vmulq_f32(vy_lo, vscale); vy_hi = vmulq_f32(vy_hi, vscale); vst1q_f32(output, vy_lo); output += 4;
vst1q_f32(output, vy_hi); output += 4; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); const int16x8_t vhx = vaddw_s8(vminus_zero_point, vx); const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx)); const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx)); float32x4_t vy = vcvtq_f32_s32(vwx_lo); vy = vmulq_f32(vy, vscale); if (batch & (4 * sizeof(int8_t))) { vst1q_f32(output, vy); output += 4; vy = vcvtq_f32_s32(vwx_hi); vy = vmulq_f32(vy, vscale); } float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(int8_t))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int32_t* buffer, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 8)) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vst1q_s32(b, vacc0123); b += 4; vst1q_s32(b, vacc4567); b += 4; } for (rows -= 7; rows > 7; rows -= 7) { i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 8)) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 
8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(b); int32x4_t vacc4567 = vld1q_s32(b + 4); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vst1q_s32(b, vacc0123); b += 4; vst1q_s32(b, vacc4567); b += 4; } } i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max); for (; channels >= 8; channels -= 8) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif // !XNN_ARCH_ARM64 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max);
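/* Final pass: the buffered partial sums plus the last (up to 7) rows have been requantized; store 8 finished channels per iteration. */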
vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(channels != 0) { { const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vi2x01234567 = vld1_s8(i2); int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (channels & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (channels & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (channels & 1) { vst1_lane_s8(output, vout01234567, 0); } } } } void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max =
vld1_dup_s8(&params->rndnu_neon.output_max); for (; channels >= 8; channels -= 8) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif // !XNN_ARCH_ARM64 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(channels != 0) { { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (channels & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if
(channels & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (channels & 1) { vst1_lane_s8(output, vout01234567, 0); } } } } void xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const 
int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = 
vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = 
vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
              if (k > 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
              }
            }
          }
        }
      }
    }
  }
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
  vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
  vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
  vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
  vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
  vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
  vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
  vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
  vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
  vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
  vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
  vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
  int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
  vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
  vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
  int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
  int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
  vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
  vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
  int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
#endif
  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
  vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
  vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
  if (nc >= 16) {
    vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
    c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
    nc -= 16;
  } else {
    int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
    if (nc & 8) {
      vst1_s8(c0, vout0x01234567); c0 += 8;
      vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
    }
    if (nc & 4) {
      vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
      vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
    }
    if (nc & 2) {
      vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
      vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
    }
    if (nc & 1) {
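      // nc == 1: store the single remaining output byte.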
vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 
0) {
      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
      const int16x8_t vxa0 = vmovl_s8(va0);
      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      if (k >= 2 * sizeof(int8_t)) {
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        if (k > 2 * sizeof(int8_t)) {
          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          if (k >= 4 * sizeof(int8_t)) {
            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            if (k > 4 * sizeof(int8_t)) {
              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              if (k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
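    // AArch32 path: vqmovn_high_s32 is unavailable, so narrow each half
    // separately and combine.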
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
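// Editorial note: every kernel in this family finishes with the same "rndnu"
// requantization: a saturating pre-shift (vqshlq_s32), a doubling multiply
// returning the high 32 bits (vqdmulhq_s32), and a rounding post-shift
// (vrshlq_s32), followed by zero-point addition and min/max clamping. The
// shift parameters are stored as negative counts, so the left-shift
// intrinsics act as right shifts. The scalar sketch below is illustrative
// only -- it is not part of XNNPACK, it takes the shift counts as positive
// right shifts, and it ignores the saturation performed by the Q intrinsics.
static inline int8_t rndnu_requantize_sketch(
    int32_t vacc,           // raw int32 accumulator
    uint32_t pre_shift,     // positive counterpart of right_pre_shift
    int32_t multiplier,     // fixed-point multiplier from the params struct
    uint32_t post_shift,    // positive counterpart of right_post_shift, >= 1
    int16_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  const int32_t vpre = vacc >> pre_shift;  // vqshlq_s32 with a negative shift count
  // vqdmulhq_s32: high 32 bits of the doubled 64-bit product.
  const int32_t vmulhi = (int32_t) (((int64_t) vpre * (int64_t) multiplier) >> 31);
  // vrshlq_s32 with a negative shift count: rounding right shift.
  int32_t vout = (int32_t) (((int64_t) vmulhi + (INT64_C(1) << (post_shift - 1))) >> post_shift);
  vout += output_zero_point;                     // vqaddq_s16
  vout = vout < output_min ? output_min : vout;  // vmax_s8
  vout = vout > output_max ? output_max : vout;  // vmin_s8
  return (int8_t) vout;
}

// The c2s4 kernels below use a different inner loop than the mlal_lane
// kernels above: they multiply int8 lanes directly (vmull_s8/vmlal_s8),
// rotate the activation vector by 2 bytes between the c0..c3 shuffles
// (vext_s8), and fold the 16-bit products into the 32-bit accumulators
// with vpadalq_s16.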
void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  const int8_t* a0 = a;
  int8_t* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    size_t k = kc;
    while (k >= 16 * sizeof(int8_t)) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      k -= 16 * sizeof(int8_t);
    }
    if (k != 0) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
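    // Clamp to the quantized activation range before storing.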
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 2);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  const int8_t* a0 = a;
  int8_t* c0 = c;
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 2) {
    a1 = a0;
    c1 = c0;
  }
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    size_t k = kc;
    while (k >= 16 * sizeof(int8_t)) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
      int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
      int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);
      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1);
      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      va1x1 = vext_s8(va1x1, va1x1, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 =
vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t 
vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
    vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
    const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
      }
      nc = 0;
    }
  } while (nc != 0);
}
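// The IGEMM kernels below are the indirect-GEMM counterparts of the GEMM
// kernels above: each of the ks steps re-reads the activation row pointer
// from the indirection buffer a, offsetting every pointer except the shared
// zero row by a_offset, so padded taps never touch real data.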
void xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;
      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);
        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        const int8x8_t vb89ABCDEFc1 =
vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, 
vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, 
vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
            if (k > 4 * sizeof(int8_t)) {
              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
              if (k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
#endif
    const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
    if (nc >= 16) {
      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
      nc -= 16;
    } else {
      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_s8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
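// 1x8 indirect GEMM: identical structure to the 1x16 kernel above, with half
// as many output columns per pass over the packed weights.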
void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;
      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);
        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
        const int8x8_t vb01234567c3 =
vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if 
(k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const
int8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = 
vmull_s8(vb4567c0x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
}
p -= 1 * sizeof(void*);
} while (p != 0);

const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
if (nc >= 8) {
  vst1_s8(c0 + 0, vout0x01234567);
  c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
  a = (const int8_t**restrict) ((uintptr_t) a - ks);
  nc -= 8;
} else {
  if (nc & 4) {
    vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
    vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
  }
  if (nc & 2) {
    vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
    vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
  }
  if (nc & 1) {
    vst1_lane_s8(c0, vout0x01234567, 0);
  }
  nc = 0;
}
} while (nc != 0);
}

void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 2);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (2 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  int8_t* c0 = c;
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 2) {
    c1 = c0;
  }

  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w +
4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } a += 2; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const 
int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); 
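/*
 * Added note: this trailing-k block repeats the c0..c3 passes of the main
 * loop with a single 8-byte activation vector per row (no vmlal_s8 second
 * half). Each vext_s8(va, va, 2) rotates the activations by one 2-element
 * subgroup, so successive passes multiply a different pair of k values
 * against their packed weights, and vpadalq_s16 pairwise-adds the int16
 * products into the int32 accumulators.
 */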
vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
}
p -= 2 * sizeof(void*);
} while (p != 0);

const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
if (nc >= 8) {
  vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
  vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
  c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
  c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
  a = (const int8_t**restrict) ((uintptr_t) a - ks);
  nc -= 8;
} else {
  if (nc & 4) {
    vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
    vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
    vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
  }
  if (nc & 2) {
    vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
    vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
    vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
  }
  if (nc & 1) {
    vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
    vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
  }
  nc = 0;
}
} while (nc != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); }
    const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); }
    const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); }
    const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); }
    const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); }
    const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); }
    const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); }
    const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); }
    const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); }
    const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); }
    const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); }
    const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); }
    const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); }
    const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); }
    const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); }
    const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); }
    const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); }
    const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); }
    const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); }
    const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); }
    const int8_t* i20 = input[20]; assert(i20 != NULL); if
XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 
8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi9x89ABCDEF = vld1_s8(i9); i9 += 8; const int8x8_t vk9x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi9x89ABCDEF, vk9x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi10x89ABCDEF = vld1_s8(i10); i10 += 8; const int8x8_t vk10x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); vprod89ABCDEF = vmull_s8(vi10x89ABCDEF, vk10x89ABCDEF); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi11x89ABCDEF = vld1_s8(i11); i11 += 8; const int8x8_t vk11x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi11x89ABCDEF, vk11x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t 
vi12x89ABCDEF = vld1_s8(i12); i12 += 8; const int8x8_t vk12x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); vprod89ABCDEF = vmull_s8(vi12x89ABCDEF, vk12x89ABCDEF); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi13x89ABCDEF = vld1_s8(i13); i13 += 8; const int8x8_t vk13x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi13x89ABCDEF, vk13x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi14x89ABCDEF = vld1_s8(i14); i14 += 8; const int8x8_t vk14x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); vprod89ABCDEF = vmull_s8(vi14x89ABCDEF, vk14x89ABCDEF); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi15x89ABCDEF = vld1_s8(i15); i15 += 8; const int8x8_t vk15x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi15x89ABCDEF, vk15x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi16x89ABCDEF = vld1_s8(i16); i16 += 8; const int8x8_t vk16x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); vprod89ABCDEF = vmull_s8(vi16x89ABCDEF, vk16x89ABCDEF); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi17x89ABCDEF = vld1_s8(i17); i17 += 8; const int8x8_t vk17x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi17x89ABCDEF, vk17x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi18x89ABCDEF = vld1_s8(i18); i18 += 8; const int8x8_t vk18x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); vprod89ABCDEF = vmull_s8(vi18x89ABCDEF, vk18x89ABCDEF); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi19x89ABCDEF = vld1_s8(i19); i19 += 8; const int8x8_t vk19x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); 
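/*
 * Added note: taps are consumed in pairs here -- vmull_s8 starts a fresh
 * int16 product vector on an even tap and vmlal_s8 folds the following odd
 * tap into it; only then is the pair widened into the int32 accumulators
 * with vaddw_s16. Widening after every pair keeps the short-lived int16
 * stage within range for the operand ranges these kernels assume, while
 * halving the number of widening adds.
 */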
vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi19x89ABCDEF, vk19x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi20x89ABCDEF = vld1_s8(i20); i20 += 8; const int8x8_t vk20x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); vprod89ABCDEF = vmull_s8(vi20x89ABCDEF, vk20x89ABCDEF); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi21x89ABCDEF = vld1_s8(i21); i21 += 8; const int8x8_t vk21x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi21x89ABCDEF, vk21x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi22x89ABCDEF = vld1_s8(i22); i22 += 8; const int8x8_t vk22x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); vprod89ABCDEF = vmull_s8(vi22x89ABCDEF, vk22x89ABCDEF); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi23x89ABCDEF = vld1_s8(i23); i23 += 8; const int8x8_t vk23x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi23x89ABCDEF, vk23x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi24x89ABCDEF = vld1_s8(i24); i24 += 8; const int8x8_t vk24x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vprod89ABCDEF = vmull_s8(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, 
vscale4567);
vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB);
vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF);
vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif // !XNN_ARCH_ARM64
vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
vst1q_s8(output, vout0123456789ABCDEF); output += 16;
}
if XNN_UNLIKELY(c != 0) {
  const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
  do {
    int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
    const int8x8_t vk0x01234567 = vld1_s8(k); k += 8;
    int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567);
    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
    const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8));
    vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
    const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24));
    vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567);
    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
    const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40));
    vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
    const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56));
    vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567);
    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
    const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72));
    vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
    const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88));
    vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567);
    const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8;
    const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104));
    vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const
int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8((const void*) (k + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8((const void*) (k + 152)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8((const void*) (k + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8((const void*) (k + 184)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8((const void*) (k + 200)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8((const void*) (k + 216)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8((const void*) (k + 232)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8((const void*) (k + 248)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8((const void*) (k + 264)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8((const void*) (k + 280)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8((const void*) (k + 296)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8((const void*) (k + 312)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8((const void*) (k + 328)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8((const void*) (k + 344)); vprod01234567 = 
vmull_s8(vi22x01234567, vk22x01234567);
const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8;
const int8x8_t vk23x01234567 = vld1_s8((const void*) (k + 360));
vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8;
const int8x8_t vk24x01234567 = vld1_s8((const void*) (k + 376));
vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 400 * sizeof(int8_t)));
const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 400 * sizeof(int8_t) + 4 * sizeof(float)));
vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));

if XNN_LIKELY(c >= 8) {
  vst1_s8(output, vout01234567); output += 8;
  c -= 8;
} else {
  if (c & 4) {
    vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
    vout01234567 = vext_s8(vout01234567, vout01234567, 4);
  }
  if (c & 2) {
    vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
    vout01234567 = vext_s8(vout01234567, vout01234567, 2);
  }
  if (c & 1) {
    vst1_lane_s8(output, vout01234567, 0); output += 1;
  }
  c = 0;
}
} while (c != 0);
}
output = (int8_t*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min);
  const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); }
    const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); }
    const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); }
    const int8_t* i3 = input[3]; assert(i3 != NULL); if
XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, 
vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = 
vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8;
const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567);
const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8;
const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8;
const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567);
const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8;
const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8;
const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567);
const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8;
const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8;
const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567);
const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8;
const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8;
const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4;
const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4;
vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#endif // !XNN_ARCH_ARM64
vout01234567 = vmax_s8(vout01234567,
voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vk0x01234567 = vld1_s8(w); int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vk1x01234567 = vld1_s8((const void*) ((const int8_t*) w + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); const int8x8_t vk2x01234567 = vld1_s8((const void*) ((const int8_t*) w + 16)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); const int8x8_t vk3x01234567 = vld1_s8((const void*) ((const int8_t*) w + 24)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); const int8x8_t vk4x01234567 = vld1_s8((const void*) ((const int8_t*) w + 32)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); const int8x8_t vk5x01234567 = vld1_s8((const void*) ((const int8_t*) w + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); const int8x8_t vk6x01234567 = vld1_s8((const void*) ((const int8_t*) w + 48)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); const int8x8_t vk7x01234567 = vld1_s8((const void*) ((const int8_t*) w + 56)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); const int8x8_t vk8x01234567 = vld1_s8((const void*) ((const int8_t*) w + 64)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); const int8x8_t vk9x01234567 = vld1_s8((const void*) ((const int8_t*) w + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); const int8x8_t vk10x01234567 = vld1_s8((const void*) ((const int8_t*) w + 80)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); const int8x8_t vk11x01234567 = vld1_s8((const void*) ((const int8_t*) w + 88)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); const int8x8_t vk12x01234567 = vld1_s8((const void*) ((const int8_t*) w + 96)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); const int8x8_t vk13x01234567 = vld1_s8((const void*) ((const int8_t*) w + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, 
vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); const int8x8_t vk14x01234567 = vld1_s8((const void*) ((const int8_t*) w + 112)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); const int8x8_t vk15x01234567 = vld1_s8((const void*) ((const int8_t*) w + 120)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); const int8x8_t vk16x01234567 = vld1_s8((const void*) ((const int8_t*) w + 128)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); const int8x8_t vk17x01234567 = vld1_s8((const void*) ((const int8_t*) w + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); const int8x8_t vk18x01234567 = vld1_s8((const void*) ((const int8_t*) w + 144)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); const int8x8_t vk19x01234567 = vld1_s8((const void*) ((const int8_t*) w + 152)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); const int8x8_t vk20x01234567 = vld1_s8((const void*) ((const int8_t*) w + 160)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); const int8x8_t vk21x01234567 = vld1_s8((const void*) ((const int8_t*) w + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); const int8x8_t vk22x01234567 = vld1_s8((const void*) ((const int8_t*) w + 176)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); const int8x8_t vk23x01234567 = vld1_s8((const void*) ((const int8_t*) w + 184)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); const int8x8_t vk24x01234567 = vld1_s8((const void*) ((const int8_t*) w + 192)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 0 * sizeof(int32_t) + 200 * sizeof(int8_t))); const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 0 * sizeof(int32_t) + 200 * sizeof(int8_t) + 4 * sizeof(float))); vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); 
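/*
 * Editor's note (added comment): the two vaddq_f32 statements above and the
 * two vqsubq_s32 statements below implement fp32 requantization with the
 * "magic bias" rounding trick.  A scalar sketch of the same math, assuming
 * the usual fp32_neon parameters (magic_bias = 0x1.8p+23f = 12582912.0f and
 * magic_bias_less_output_zero_point = 0x4B400000 - output_zero_point):
 *
 *   float x = (float) acc * scale;   // accumulator times per-channel scale
 *   x += 0x1.8p+23f;                 // forces round-to-nearest; the rounded
 *                                    // integer now sits in the mantissa bits
 *   int32_t y = bits(x) - (0x4B400000 - output_zero_point);
 *   // y == round(acc * scale) + output_zero_point
 *
 * bits() stands for a bit-cast of the float (e.g. via memcpy) and is
 * illustrative only, not an XNNPACK helper.  The saturating vqsubq_s32
 * performs the bit-pattern subtraction; final clamping to the int8 range
 * happens in the vqmovn/vmax/vmin steps further down.
 */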
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif
      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);

      vout01234567 = vmax_s8(vout01234567, voutput_min);
      vout01234567 = vmin_s8(vout01234567, voutput_max);

      if (c & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
        vout01234567 = vext_s8(vout01234567, vout01234567, 4);
      }
      if (c & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
        vout01234567 = vext_s8(vout01234567, vout01234567, 2);
      }
      if (c & 1) {
        vst1_lane_s8(output, vout01234567, 0); output += 1;
      }
    }
  }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    for (; c >= 16; c -= 16) {
      int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4;

      const int8x16_t vi0x0123456789ABCDEF = vld1q_s8(i0); i0 += 16;
      const int8x16_t vk0x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16;
      int16x8_t vprod01234567 = vmull_s8(vget_low_s8(vi0x0123456789ABCDEF), vget_low_s8(vk0x0123456789ABCDEF));
      int16x8_t vprod89ABCDEF = vmull_s8(vget_high_s8(vi0x0123456789ABCDEF), vget_high_s8(vk0x0123456789ABCDEF));
      const int8x16_t vi1x0123456789ABCDEF = vld1q_s8(i1); i1 += 16;
      const int8x16_t vk1x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16;
      vprod01234567 = vmlal_s8(vprod01234567, vget_low_s8(vi1x0123456789ABCDEF), vget_low_s8(vk1x0123456789ABCDEF));
      vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vget_high_s8(vi1x0123456789ABCDEF), vget_high_s8(vk1x0123456789ABCDEF));
      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
      vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF));
      vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF));
      const int8x16_t vi2x0123456789ABCDEF = vld1q_s8(i2); i2 += 16;
      const int8x16_t
vk2x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16; vprod01234567 = vmull_s8(vget_low_s8(vi2x0123456789ABCDEF), vget_low_s8(vk2x0123456789ABCDEF)); vprod89ABCDEF = vmull_s8(vget_high_s8(vi2x0123456789ABCDEF), vget_high_s8(vk2x0123456789ABCDEF)); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB); vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t 
vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
      const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t) + 4 * sizeof(float)));
      vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

      vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
      vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));

      vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif
      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);

      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));

      if XNN_LIKELY(c >= 8) {
        vst1_s8(output, vout01234567); output += 8;
        c -= 8;
      } else {
        if (c & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (c & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (c & 1) {
          vst1_lane_s8(output, vout01234567, 0); output += 1;
        }
        c = 0;
      }
    } while (c != 0);
  }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
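/*
 * Editor's note (added comment): i0..i8 are the nine taps' input rows for
 * this output pixel, fetched through the indirection buffer `input`.  A row
 * that compares equal to the shared `zero` buffer is a padding row, and the
 * checks above deliberately skip rebasing it by input_offset; real rows are
 * rebased.  The same check is applied to i8 just below.
 */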
assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const 
int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB); vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const 
int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))); const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t) + 4 * sizeof(float))); vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { vst1_s8(output, vout01234567); output += 8; c -= 8; 
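// Editor's note (added comment): the branch above stores a full group of 8
// channels.  The `else` branch that follows peels the final c < 8 channels:
// it stores 4-, 2- and 1-byte pieces with lane stores, rotating the vector
// with vext_s8 after each piece so the next channels are back in lane 0.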
} else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(int8_t)) { const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); } } } } } } } float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = 
vld1q_dup_f32(¶ms->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #endif const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = 
(const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = 
vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #endif const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); nc -= 8; } else { // Final case where not all of the 8 columns fit in the destination. if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { a1 = a0; c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = 
vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = 
vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;

      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
    }

    float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
    float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
    float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
    float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);

    const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4;
    vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
    vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123);
    const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4;
    vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);
    vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567);

    const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
    vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
    vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
    vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
    vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));

    const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
    vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
    vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point);
    vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);

    #if XNN_ARCH_ARM64
      int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
      int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
      int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
    #else
      int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
      int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
    #endif

    const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);

    if (nc >= 8) {
      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  int8_t* c0 = c;

  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);

        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const
int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const 
int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
              if (k > 4 * sizeof(int8_t)) {
                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
                if (k >= 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                  if (k > 6 * sizeof(int8_t)) {
                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  }
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
    float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);

    const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
    const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);

    const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
    vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
    vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));

    const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
    vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);

    #if XNN_ARCH_ARM64
      int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
      int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
    #else
      int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
      int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
    #endif

    const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t** restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const
void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k 
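/* k remainder below: a single 8-byte activation slice, without the vmlal_s8 pairing used by the 16-byte main loop. */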
!= 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } p -= 1 * sizeof(void*); } while (p != 0); float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0,
vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, size_t ks, const int8_t** restrict a, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (2 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } a += 2; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = 
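/* Both rows share each loaded weight vector; only the activations differ between the paired vmull_s8/vmlal_s8 products. */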
vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = 
vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); } p -= 2 * sizeof(void*); } while (p != 0); float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123); float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias)); vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point); vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min); vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max); vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max); if (nc >= 8) { vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567)); vst1_s8(c0 + 0,
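/* Two-row store: row 0 comes from the low half of the combined 16-lane vector, row 1 from the high half. */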
vget_low_s8(vout0x01234567_1x01234567)); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4; vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2; vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); } if (nc & 1) { vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0); } nc = 0; } } while (nc != 0); }

void xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); int8x16_t vout0123456789ABCDEF =
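/* Saturating narrow from 16 bits to 8 bits, then clamp to [output_min, output_max]. */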
vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int8x8_t vaGHIJKLMN = vld1_s8(input_a); input_a += 8; const int8x8_t vbGHIJKLMN = vld1_s8(input_b); input_b += 8; const int8x8_t vaOPQRSTUV = vld1_s8(input_a); input_a += 8; const int8x8_t vbOPQRSTUV = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t
vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); const int16x8_t vxaGHIJKLMN = vsubl_s8(vaGHIJKLMN, va_zero_point); const int16x8_t vxbGHIJKLMN = vsubl_s8(vbGHIJKLMN, vb_zero_point); const int16x8_t vxaOPQRSTUV = vsubl_s8(vaOPQRSTUV, va_zero_point); const int16x8_t vxbOPQRSTUV = vsubl_s8(vbOPQRSTUV, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmulq_s32(vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmulq_s32(vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmulq_s32(vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmulq_s32(vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vaccGHIJ = vmlaq_s32(vaccGHIJ, vmovl_s16(vget_low_s16(vxbGHIJKLMN)), vb_multiplier); vaccKLMN = vmlaq_s32(vaccKLMN, vmovl_s16(vget_high_s16(vxbGHIJKLMN)), vb_multiplier); vaccOPQR = vmlaq_s32(vaccOPQR, vmovl_s16(vget_low_s16(vxbOPQRSTUV)), vb_multiplier); vaccSTUV = vmlaq_s32(vaccSTUV, vmovl_s16(vget_high_s16(vxbOPQRSTUV)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = 
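/* Tail loop: full 8-byte loads are safe under the XNN_OOB_READS attribute; only the valid 4/2/1-byte pieces are stored. */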
vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output,
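/* Scalar addend was folded into vbias = (b - b_zero_point) * b_multiplier before the loop, so only the per-element operand is scaled here. */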
vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vaGHIJKLMN = vld1_s8(input_a); input_a += 8; const int8x8_t vaOPQRSTUV = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxaGHIJKLMN = vsubl_s8(vaGHIJKLMN, va_zero_point); const int16x8_t vxaOPQRSTUV = vsubl_s8(vaOPQRSTUV, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t
vaccSTUV = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vcvt_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vmultiplier = vld1q_dup_s16(&params->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 =
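/* Requantize: widen to 16 bits, shift into Q7, apply a rounding doubling high multiply (vqrdmulhq_s16), then add the output zero point with saturation. */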
vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vacc2), vqmovn_s16(vacc3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x16( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); vout0 =
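/* h-swish core: vin is the rescaled, half-subtracted, zero-clamped gate (vhalf = 16384 is 0.5 in Q15, so vin <= 0 after the clamp); the two vqdmulhq_s16 steps multiply it by the input rescaled through scale_ratio. */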
vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vout0), vqmovn_s16(vout1)); vst1q_s8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); int16x8_t vin2 = vqdmulhq_s16(vacc2, vinput_scale_div_mantissa); int16x8_t vin3 = vqdmulhq_s16(vacc3, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin2 = vqshlq_s16(vin2, vinput_scale_div_exp); vin3 = vqshlq_s16(vin3, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 =
vqsubq_s16(vin1, vhalf); vin2 = vqsubq_s16(vin2, vhalf); vin3 = vqsubq_s16(vin3, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); vin2 = vminq_s16(vin2, vzero); vin3 = vminq_s16(vin3, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); int16x8_t vout2 = vqdmulhq_s16(vacc2, vscale_ratio); int16x8_t vout3 = vqdmulhq_s16(vacc3, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout2 = vqdmulhq_s16(vout2, vin2); vout3 = vqdmulhq_s16(vout3, vin3); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); vout2 = vqaddq_s16(vout2, voutput_zero_point); vout3 = vqaddq_s16(vout3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vout0), vqmovn_s16(vout1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vout2), vqmovn_s16(vout3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x8( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin,
vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vlrelu_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vpositive_multiplier = vld1q_dup_s16(&params->neon.positive_multiplier); const int16x8_t vnegative_multiplier = vld1q_dup_s16(&params->neon.negative_multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); const uint16x8_t vmask0 = vcltq_s16(vacc0, vmovq_n_s16(0)); const uint16x8_t vmask1 = vcltq_s16(vacc1, vmovq_n_s16(0)); const uint16x8_t vmask2 = vcltq_s16(vacc2, vmovq_n_s16(0)); const uint16x8_t vmask3 = vcltq_s16(vacc3, vmovq_n_s16(0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); const int16x8_t vmultiplier0 = vbslq_s16(vmask0, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier1 = vbslq_s16(vmask1, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier2 = vbslq_s16(vmask2, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier3 = vbslq_s16(vmask3, vpositive_multiplier, vnegative_multiplier); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier0); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier1); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier2); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier3); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vacc2), vqmovn_s16(vacc3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx
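/* Leaky ReLU: vacc holds (zero_point - x), so vacc < 0 flags inputs above the zero point; vbslq_s16 then picks the positive or negative slope per lane. */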
= vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(params->rndnu_neon.b_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb89ABCDEF)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 =
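/* rndnu requantization: saturating pre-shift, doubling high multiply, then rounding post-shift; vrshlq_s32 with a negative shift count rounds to nearest with ties up. */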
vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
#if XNN_ARCH_ARM64
  int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else
  int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
#if XNN_ARCH_ARM64
  int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else
  int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif
vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
#if XNN_ARCH_ARM64
  int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
  int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t
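/* Scalar-operand variant: the broadcast (b - b_zero_point) is widened once as vxb below and reused for every batch element. */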
void xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16(
    size_t batch,
    const int8_t* input_a,
    const int8_t* input_b,
    int8_t* output,
    const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(int8_t) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point);
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);

  const int8x8_t vb = vld1_dup_s8(input_b);
  const int8x8_t vb_zero_point = vld1_dup_s8(params->rndnu_neon.b_zero_point);
  const int16x8_t vxb = vsubl_s8(vb, vb_zero_point);

  for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) {
    const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
    const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8;

    const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
    const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point);

    int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb));
    int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb));
    int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb));
    int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
    vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);

    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
    vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);

    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
    vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
    vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
    int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
    int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);

#if XNN_ARCH_ARM64
    int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else
    int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif

    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);

    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
  }
  if XNN_UNLIKELY(batch != 0) {
    do {
      const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;

      const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);

      int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb));
      int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif

      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
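
      // Store path for the remainder loop: 8 or more leftover elements take a
      // full 8-byte store; a sub-8 remainder is written with progressively
      // narrower lane stores (4, 2, then 1 byte), rotating the vector with
      // vext_s8 after each store so the next bytes sit in lane 0.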
      if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) {
        vst1_s8(output, vout01234567); output += 8;
        batch -= 8 * sizeof(int8_t);
      } else {
        if (batch & (4 * sizeof(int8_t))) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (batch & (2 * sizeof(int8_t))) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (batch & (1 * sizeof(int8_t))) {
          vst1_lane_s8(output, vout01234567, 0);
        }
        batch = 0;
      }
    } while (batch != 0);
  }
}

// Multi-pass QU8 average pooling for kernels of more than 9 taps: a first
// pass accumulates 9 rows into the int32 `buffer`, middle passes add 8 rows
// each, and the final pass (up to 8 rows) rescales in fp32 and stores uint8.
void xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__neon_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);

  do {
    {
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }
      const uint8_t* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
      }

      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
        const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
        const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
        const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
        const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
        const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
        const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
        const uint8x8_t vi7 = vld1_u8(i7); i7 += 8;
        const uint8x8_t vi8 = vld1_u8(i8); i8 += 8;

        const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8);
        const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
        const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
        const uint16x8_t vsum67 = vaddl_u8(vi6, vi7);

        const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
        const uint16x8_t vsum01678 = vaddq_u16(vsum018,
vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); const int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); const int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); vst1q_s32(b, vacc_lo); b += 4; vst1q_s32(b, vacc_hi); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const uint8_t* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } const uint8_t* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } const uint8_t* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } const uint8_t* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); } const uint8_t* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } const uint8_t* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } const uint8_t* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } const uint8_t* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } int32_t* b = buffer; for (size_t c = 0; c < channels; c += 8) { const uint8x8_t vi0 = vld1_u8(i0); i0 += 8; const uint8x8_t vi1 = vld1_u8(i1); i1 += 8; const uint8x8_t vi2 = vld1_u8(i2); i2 += 8; const uint8x8_t vi3 = vld1_u8(i3); i3 += 8; const uint8x8_t vi4 = vld1_u8(i4); i4 += 8; const uint8x8_t vi5 = vld1_u8(i5); i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; int32x4_t vacc_lo = vld1q_s32(b); int32x4_t vacc_hi = vld1q_s32(b + 4); const uint16x8_t vsum01 = vaddl_u8(vi0, vi1); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum0123 = vaddq_u16(vsum01, vsum23); const uint16x8_t vsum4567 = vaddq_u16(vsum45, vsum67); const uint16x8_t vsum = vaddq_u16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vreinterpret_s16_u16(vget_low_u16(vsum))); vacc_hi = vaddw_s16(vacc_hi, vreinterpret_s16_u16(vget_high_u16(vsum))); vst1q_s32(b, vacc_lo); b += 4; vst1q_s32(b, vacc_hi); b += 4; } } { const uint8_t* i0 = input[0]; assert(i0 != NULL); const uint8_t* i1 = input[1]; const uint8_t* i2 = input[2]; const uint8_t* i3 = input[3]; const uint8_t* i4 = input[4]; const uint8_t* i5 = input[5]; const uint8_t* i6 = input[6]; const uint8_t* i7 = input[7]; input = (const uint8_t**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) 
i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } size_t c = channels; int32_t* b = buffer; while (c >= 8) { const uint8x8_t vi0 = vld1_u8(i0); i0 += 8; const uint8x8_t vi1 = vld1_u8(i1); i1 += 8; const uint8x8_t vi2 = vld1_u8(i2); i2 += 8; const uint8x8_t vi3 = vld1_u8(i3); i3 += 8; const uint8x8_t vi4 = vld1_u8(i4); i4 += 8; const uint8x8_t vi5 = vld1_u8(i5); i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; int32x4_t vacc_lo = vld1q_s32(b); b += 4; int32x4_t vacc_hi = vld1q_s32(b); b += 4; const int16x8_t vsum01 = vreinterpretq_s16_u16(vaddl_u8(vi0, vi1)); const int16x8_t vsum23 = vreinterpretq_s16_u16(vaddl_u8(vi2, vi3)); const int16x8_t vsum45 = vreinterpretq_s16_u16(vaddl_u8(vi4, vi5)); const int16x8_t vsum67 = vreinterpretq_s16_u16(vaddl_u8(vi6, vi7)); const int16x8_t vsum0123 = vaddq_s16(vsum01, vsum23); const int16x8_t vsum4567 = vaddq_s16(vsum45, vsum67); const int16x8_t vsum = vaddq_s16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); vst1_u8(output, vout); output += 8; c -= 8; } if (c != 0) { const uint8x8_t vi0 = vld1_u8(i0); const uint8x8_t vi1 = vld1_u8(i1); const uint8x8_t vi2 = vld1_u8(i2); const uint8x8_t vi3 = vld1_u8(i3); const uint8x8_t vi4 = vld1_u8(i4); const uint8x8_t vi5 = vld1_u8(i5); const uint8x8_t vi6 = vld1_u8(i6); const uint8x8_t vi7 = vld1_u8(i7); int32x4_t vacc_lo = vld1q_s32(b); b += 4; int32x4_t vacc_hi = vld1q_s32(b); const int16x8_t vsum01 = vreinterpretq_s16_u16(vaddl_u8(vi0, vi1)); const int16x8_t vsum23 = vreinterpretq_s16_u16(vaddl_u8(vi2, vi3)); const int16x8_t vsum45 = vreinterpretq_s16_u16(vaddl_u8(vi4, vi5)); const int16x8_t vsum67 = vreinterpretq_s16_u16(vaddl_u8(vi6, vi7)); const int16x8_t vsum0123 = vaddq_s16(vsum01, vsum23); const int16x8_t vsum4567 = vaddq_s16(vsum45, vsum67); const int16x8_t vsum = vaddq_s16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if 
XNN_ARCH_ARM64
        int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi);
#else
        int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi));
#endif
        uint8x8_t vout = vqmovun_s16(vacc);
        vout = vmax_u8(vout, voutput_min);
        vout = vmin_u8(vout, voutput_max);

        if (c & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4;
          vout = vext_u8(vout, vout, 4);
        }
        if (c & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2;
          vout = vext_u8(vout, vout, 2);
        }
        if (c & 1) {
          vst1_lane_u8(output, vout, 0); output += 1;
        }
      }
    }
    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}

// Single-pass QU8 average pooling for up to 9 kernel taps; tap pointers past
// kernel_elements are redirected to the `zero` buffer.
void xnn_qu8_avgpool_minmax_fp32_ukernel_9x__neon_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    const uint8_t* i1 = input[1];
    const uint8_t* i2 = input[2];
    const uint8_t* i3 = input[3];
    const uint8_t* i4 = input[4];
    const uint8_t* i5 = input[5];
    const uint8_t* i6 = input[6];
    const uint8_t* i7 = input[7];
    const uint8_t* i8 = input[8];
    input = (const uint8_t**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) { i1 = zero; }
    assert(i1 != NULL);
    if (kernel_elements <= 2) { i2 = zero; }
    assert(i2 != NULL);
    if (kernel_elements < 4) { i3 = zero; }
    assert(i3 != NULL);
    if (kernel_elements <= 4) { i4 = zero; }
    assert(i4 != NULL);
    if (kernel_elements < 6) { i5 = zero; }
    assert(i5 != NULL);
    if (kernel_elements <= 6) { i6 = zero; }
    assert(i6 != NULL);
    if (kernel_elements < 8) { i7 = zero; }
    assert(i7 != NULL);
    if (kernel_elements <= 8) { i8 = zero; }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 8) {
      const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
      const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
      const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
      const uint8x8_t vi5 = vld1_u8(i5);
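      // (The remaining tap loads continue below.) The requantization further
      // down is the fp32 "magic bias" scheme: the int32 sum is converted to
      // float, multiplied by vscale, and vmagic_bias is added so that the
      // rounded integer lands in the low mantissa bits; vqsubq_s32 of
      // vmagic_bias_less_output_zero_point then strips the bias while folding
      // in the output zero point.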
i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; const uint8x8_t vi8 = vld1_u8(i8); i8 += 8; const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45); const uint16x8_t vsum01678 = vaddq_u16(vsum018, vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); vst1_u8(output, vout); output += 8; c -= 8; } if (c != 0) { const uint8x8_t vi0 = vld1_u8(i0); const uint8x8_t vi1 = vld1_u8(i1); const uint8x8_t vi2 = vld1_u8(i2); const uint8x8_t vi3 = vld1_u8(i3); const uint8x8_t vi4 = vld1_u8(i4); const uint8x8_t vi5 = vld1_u8(i5); const uint8x8_t vi6 = vld1_u8(i6); const uint8x8_t vi7 = vld1_u8(i7); const uint8x8_t vi8 = vld1_u8(i8); const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45); const uint16x8_t vsum01678 = vaddq_u16(vsum018, vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4; vout = vext_u8(vout, vout, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2; vout = vext_u8(vout, vout, 2); } if (c & 1) { vst1_lane_u8(output, vout, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8( size_t channels, size_t output_width, const uint8_t** 
input,
    const void* weights,
    uint8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const uint16x8_t vkernel_zero_point = vmovl_u8(vld1_dup_u8(params->rndnu_neon.kernel_zero_point));
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    const uint8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    const uint8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    const uint8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    const uint8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    const uint8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    const uint8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    const uint8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    const uint8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }
    const uint8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
    }
    const uint8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
    }
    const uint8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
    }
    const uint8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
    }
    const uint8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
    }
    const uint8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
    }
    const uint8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
    }
    const uint8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
    }
    const uint8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
    }
    const uint8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 =
(const uint8_t*) ((uintptr_t) i18 + input_offset); } const uint8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset); } const uint8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset); } const uint8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset); } const uint8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset); } const uint8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset); } const uint8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset); } input = (const uint8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = 
vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8; const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8; const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8; const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8; const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8; const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8; const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const 
int8_t*) w + 8; vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8; const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8; const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8; const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8; const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8; const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8; const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8; const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 
+= 8; const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8; const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8; const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8; const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point))); vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point))); vacc0123 = vshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); #else const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); #endif vout01234567 = vmax_u8(vout01234567, voutput_min); vout01234567 = vmin_u8(vout01234567, voutput_max); vst1_u8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); const uint8x8_t vk0x01234567 = vld1_u8(w); uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); const uint8x8_t vk1x01234567 = vld1_u8((const uint8_t*) w + 8); vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t 
vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); const uint8x8_t vk2x01234567 = vld1_u8((const uint8_t*) w + 16); vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); const uint8x8_t vk3x01234567 = vld1_u8((const uint8_t*) w + 24); vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); const uint8x8_t vk4x01234567 = vld1_u8((const uint8_t*) w + 32); vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); const uint8x8_t vk5x01234567 = vld1_u8((const uint8_t*) w + 40); vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); const uint8x8_t vk6x01234567 = vld1_u8((const uint8_t*) w + 48); vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); const uint8x8_t vk7x01234567 = vld1_u8((const uint8_t*) w + 56); vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); const uint8x8_t vk8x01234567 = vld1_u8((const uint8_t*) w + 64); vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi9x01234567 = vld1_u8(i9); const uint8x8_t vk9x01234567 = vld1_u8((const uint8_t*) w + 72); vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567); vacc0123 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi10x01234567 = vld1_u8(i10); const uint8x8_t vk10x01234567 = vld1_u8((const uint8_t*) w + 80); vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi11x01234567 = vld1_u8(i11); const uint8x8_t vk11x01234567 = vld1_u8((const uint8_t*) w + 88); vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi12x01234567 = vld1_u8(i12); const uint8x8_t vk12x01234567 = vld1_u8((const uint8_t*) w + 96); vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi13x01234567 = vld1_u8(i13); const uint8x8_t vk13x01234567 = vld1_u8((const uint8_t*) w + 104); vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi14x01234567 = vld1_u8(i14); const uint8x8_t vk14x01234567 = vld1_u8((const uint8_t*) w + 112); vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi15x01234567 = vld1_u8(i15); const uint8x8_t vk15x01234567 = vld1_u8((const uint8_t*) w + 120); vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi16x01234567 = vld1_u8(i16); const uint8x8_t vk16x01234567 = vld1_u8((const uint8_t*) w + 128); vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi17x01234567 = vld1_u8(i17); const uint8x8_t vk17x01234567 = vld1_u8((const uint8_t*) w + 136); vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567); vacc0123 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi18x01234567 = vld1_u8(i18); const uint8x8_t vk18x01234567 = vld1_u8((const uint8_t*) w + 144); vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi19x01234567 = vld1_u8(i19); const uint8x8_t vk19x01234567 = vld1_u8((const uint8_t*) w + 152); vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi20x01234567 = vld1_u8(i20); const uint8x8_t vk20x01234567 = vld1_u8((const uint8_t*) w + 160); vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi21x01234567 = vld1_u8(i21); const uint8x8_t vk21x01234567 = vld1_u8((const uint8_t*) w + 168); vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi22x01234567 = vld1_u8(i22); const uint8x8_t vk22x01234567 = vld1_u8((const uint8_t*) w + 176); vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi23x01234567 = vld1_u8(i23); const uint8x8_t vk23x01234567 = vld1_u8((const uint8_t*) w + 184); vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi24x01234567 = vld1_u8(i24); const uint8x8_t vk24x01234567 = vld1_u8((const uint8_t*) w + 192); vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point))); vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point))); 
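
      // vsum01234567 is the plain sum of all 25 input taps. The two vmlsl_u16
      // statements above subtract kernel_zero_point * sum(inputs) from the
      // uint8*uint8 product accumulators, yielding the signed
      // sum(input * (kernel - kernel_zero_point)) that the rndnu
      // requantization below expects.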
      vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
      vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

#if XNN_ARCH_ARM64
      const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#else
      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#endif

      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (c & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (c & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (c & 1) {
        vst1_lane_u8(output, vout01234567, 0); output += 1;
      }
      }
    }

    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

// Depthwise QU8 convolution, 9 taps, 16 channels per main-loop iteration,
// with rndnu requantization.
void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8(
    size_t channels,
    size_t output_width,
    const uint8_t** input,
    const void* weights,
    uint8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const uint16x8_t vkernel_zero_point = vmovl_u8(vld1_dup_u8(params->rndnu_neon.kernel_zero_point));
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
  const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    const uint8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    const uint8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    const uint8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    const uint8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    const uint8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    const uint8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    const uint8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    const uint8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
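
    // Main channel loop: 16 channels per iteration. Each of the 9 taps does a
    // uint8 vmull_u8 multiply into four int32 accumulators, while
    // vsum01234567/vsum89ABCDEF collect the raw input sums used for the
    // kernel zero-point correction after the last tap.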
size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF); uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); 
vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));

      const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
      const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const int8_t*) w + 8;
      const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
      const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8;

      vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
      vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));

      // Kernel zero-point correction: acc -= kzp * sum(inputs), turning the u8*u8 products
      // into the signed (k - kzp) * i form.
      vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point)));
      vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point)));
      vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point)));
      vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point)));

      // rndnu requantization: the pre-shift is the saturating vqshlq_s32 (matching every
      // other rndnu kernel in this file), then a doubling high multiply, then a rounding
      // post-shift.
      vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);
      vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift);
      vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
      vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
      vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
      vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);

#if XNN_ARCH_ARM64
      const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
      const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
      uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
#else
      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
      const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
      uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
#endif

      vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
      vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);

      vst1q_u8(output, vout0123456789ABCDEF); output += 16;
    }
    if XNN_UNLIKELY(c != 0) {
      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
      do {
        int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
        int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;

        const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
        const uint8x8_t
vk0x01234567 = vld1_u8(k); k += 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8((const void*) (k + 8)); vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8((const void*) (k + 24)); vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8((const void*) (k + 40)); vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8((const void*) (k + 56)); vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8((const void*) (k + 72)); vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8((const void*) (k + 88)); vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8((const void*) (k + 104)); vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8; const uint8x8_t vk8x01234567 = vld1_u8((const void*) (k + 120)); vprod01234567 = 
vmull_u8(vi8x01234567, vk8x01234567);
        vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
        vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
        vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));

        vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point)));
        vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point)));

        // Same rndnu sequence as the 16-channel loop: the pre-shift must be the saturating
        // vqshlq_s32; the rounding vrshlq_s32 belongs to the post-shift only.
        vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
        vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);
        vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
        vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
        vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
        vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

#if XNN_ARCH_ARM64
        const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
        uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#else
        const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
        uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#endif

        vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
        vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));

        if XNN_LIKELY(c >= 8) {
          vst1_u8(output, vout01234567); output += 8;
          c -= 8;
        } else {
          if (c & 4) {
            vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
            vout01234567 = vext_u8(vout01234567, vout01234567, 4);
          }
          if (c & 2) {
            vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
            vout01234567 = vext_u8(vout01234567, vout01234567, 2);
          }
          if (c & 1) {
            vst1_lane_u8(output, vout01234567, 0); output += 1;
          }
          c = 0;
        }
      } while (c != 0);
    }
    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
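/* The next kernel dequantizes qu8 to f32: y = scale * ((int32_t) x - zero_point).
 * The params carry a precomputed minus_zero_point, so the subtraction is done as a
 * widening add (vaddw_u8) into i16, followed by widening to i32, conversion to f32,
 * and a multiply by scale. */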
void xnn_qu8_f32_vcvt_ukernel__neon_x32(
    size_t batch,
    const uint8_t* input,
    float* output,
    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint8_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const int16x8_t vminus_zero_point = vreinterpretq_s16_u32(vld1q_dup_u32((const void*) params->neon.minus_zero_point));
  const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale);
  for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) {
    const uint8x8_t vx01234567 = vld1_u8(input); input += 8;
    const uint8x8_t vx89ABCDEF = vld1_u8(input); input += 8;
    const uint8x8_t vxGHIJKLMN = vld1_u8(input); input += 8;
    const uint8x8_t vxOPQRSTUV = vld1_u8(input); input += 8;

    const int16x8_t vhx01234567 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx01234567));
    const int16x8_t vhx89ABCDEF = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx89ABCDEF));
    const int16x8_t vhxGHIJKLMN = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vxGHIJKLMN));
    const int16x8_t vhxOPQRSTUV = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vxOPQRSTUV));

    const int32x4_t vwx0123 = vmovl_s16(vget_low_s16(vhx01234567));
    const int32x4_t vwx4567 = vmovl_s16(vget_high_s16(vhx01234567));
    const int32x4_t vwx89AB = vmovl_s16(vget_low_s16(vhx89ABCDEF));
    const int32x4_t vwxCDEF = vmovl_s16(vget_high_s16(vhx89ABCDEF));
    const int32x4_t vwxGHIJ = vmovl_s16(vget_low_s16(vhxGHIJKLMN));
    const int32x4_t vwxKLMN = vmovl_s16(vget_high_s16(vhxGHIJKLMN));
    const int32x4_t vwxOPQR = vmovl_s16(vget_low_s16(vhxOPQRSTUV));
    const int32x4_t vwxSTUV = vmovl_s16(vget_high_s16(vhxOPQRSTUV));

    float32x4_t vy0123 = vcvtq_f32_s32(vwx0123);
    float32x4_t vy4567 = vcvtq_f32_s32(vwx4567);
    float32x4_t vy89AB = vcvtq_f32_s32(vwx89AB);
    float32x4_t vyCDEF = vcvtq_f32_s32(vwxCDEF);
    float32x4_t vyGHIJ = vcvtq_f32_s32(vwxGHIJ);
    float32x4_t vyKLMN = vcvtq_f32_s32(vwxKLMN);
    float32x4_t vyOPQR = vcvtq_f32_s32(vwxOPQR);
    float32x4_t vySTUV = vcvtq_f32_s32(vwxSTUV);

    vy0123 = vmulq_f32(vy0123, vscale);
    vy4567 = vmulq_f32(vy4567, vscale);
    vy89AB = vmulq_f32(vy89AB, vscale);
    vyCDEF = vmulq_f32(vyCDEF, vscale);
    vyGHIJ = vmulq_f32(vyGHIJ, vscale);
    vyKLMN = vmulq_f32(vyKLMN, vscale);
    vyOPQR = vmulq_f32(vyOPQR, vscale);
    vySTUV = vmulq_f32(vySTUV, vscale);

    vst1q_f32(output, vy0123); output += 4;
    vst1q_f32(output, vy4567); output += 4;
    vst1q_f32(output, vy89AB); output += 4;
    vst1q_f32(output, vyCDEF); output += 4;
    vst1q_f32(output, vyGHIJ); output += 4;
    vst1q_f32(output, vyKLMN); output += 4;
    vst1q_f32(output, vyOPQR); output += 4;
    vst1q_f32(output, vySTUV); output += 4;
  }
  for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) {
    const uint8x8_t vx = vld1_u8(input); input += 8;
    const int16x8_t vhx = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx));
    const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx));
    const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx));
    float32x4_t vy_lo = vcvtq_f32_s32(vwx_lo);
    float32x4_t vy_hi = vcvtq_f32_s32(vwx_hi);
    vy_lo = vmulq_f32(vy_lo, vscale);
    vy_hi = vmulq_f32(vy_hi, vscale);
    vst1q_f32(output, vy_lo); output += 4;
    vst1q_f32(output, vy_hi); output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch >= 1 * sizeof(uint8_t));
    assert(batch <= 7 * sizeof(uint8_t));
    const uint8x8_t vx = vld1_u8(input);
    const int16x8_t vhx = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx));
    const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx));
    const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx));
    float32x4_t vy = vcvtq_f32_s32(vwx_lo);
    vy = vmulq_f32(vy, vscale);
    if (batch & (4 * sizeof(uint8_t))) {
      vst1q_f32(output, vy); output += 4;
      vy = vcvtq_f32_s32(vwx_hi);
      vy = vmulq_f32(vy, vscale);
    }
    float32x2_t vy_lo = vget_low_f32(vy);
    if (batch & (2 * sizeof(uint8_t))) {
      vst1_f32(output, vy_lo); output += 2;
      vy_lo = vget_high_f32(vy);
    }
    if (batch & (1 * sizeof(uint8_t))) {
      vst1_lane_f32(output, vy_lo, 0);
    }
  }
}
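/* Global average pooling over more than 7 rows happens in passes: the first 7 rows
 * are summed into the i32 buffer (seeded with init_bias), each further group of 7
 * rows is accumulated into that buffer, and the final pass of at most 7 rows (with
 * missing rows redirected to the zero vector) requantizes and stores. */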
void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c != 0; c = doz(c, 8)) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

    vst1q_s32(b, vacc0123); b += 4;
    vst1q_s32(b, vacc4567); b += 4;
  }
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c != 0; c = doz(c, 8)) {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
    }
  }

  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; }
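  /* rndnu requantization parameters for the final pass: the accumulator is scaled by
   * roughly 2^pre_shift * (multiplier / 2^31) * 2^post_shift, which effectively divides
   * by the row count in fixed point. The "left" shift amounts act as right shifts when
   * negative, since vqshlq_s32/vrshlq_s32 shift right for negative counts. */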
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
  for (; channels >= 8; channels -= 8) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
    vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else  // !XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

    uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);

    vout01234567 = vmax_u8(vout01234567, voutput_min);
    vout01234567 = vmin_u8(vout01234567, voutput_max);

    vst1_u8(output, vout01234567); output += 8;
  }
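  /* 1-7 leftover channels: the loads still read a full 8 bytes (the kernel is declared
   * XNN_OOB_READS, so reading past the end of the row is permitted) and only the valid
   * lanes are written via the channels & 4/2/1 cascade. */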
  if XNN_UNLIKELY(channels != 0) {
    {
      const uint8x8_t vi0x01234567 = vld1_u8(i0);
      const uint8x8_t vi1x01234567 = vld1_u8(i1);
      const uint8x8_t vi2x01234567 = vld1_u8(i2);
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3);
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4);
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5);
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6);
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

      int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
      int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (channels & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (channels & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (channels & 1) {
        vst1_lane_u8(output, vout01234567, 0);
      }
    }
  }
}
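/* Single-pass variant for at most 7 rows: no intermediate buffer is needed; pointers
 * past the last row are redirected to the zero vector so the 7-way sum is unaffected. */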
void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows != 0);
  assert(rows <= 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; }
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; }
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; }
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; }
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; }
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; }

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
  for (; channels >= 8; channels -= 8) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else  // !XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

    uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);

    vout01234567 = vmax_u8(vout01234567, voutput_min);
    vout01234567 = vmin_u8(vout01234567, voutput_max);

    vst1_u8(output, vout01234567); output += 8;
  }
  if XNN_UNLIKELY(channels != 0) {
    {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
      int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (channels & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (channels & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (channels & 1) {
        vst1_lane_u8(output, vout01234567, 0);
      }
    }
  }
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
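    /* Inner GEMM loop: 8 activations are widened to i16 (vmovl_u8), each weight row has
     * the kernel zero point subtracted while widening (vsubl_u8), and vmlal_lane_s16
     * broadcasts one activation lane per k step into the four i32 accumulator quads
     * covering the 16 output channels. */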
size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 
8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 
8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = 
vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point));

                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
    }

    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);

    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));

    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
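    /* Output store: a full 16-byte vector when nc >= 16; otherwise progressively
     * narrower lane stores of 8/4/2/1 bytes, with vext_u8 rotating the already
     * stored bytes out of the vector. */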
    if (nc >= 16) {
      vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      nc -= 16;
    } else {
      uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_u8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t k = kc;
    while (k >= 8 * sizeof(uint8_t)) {
      const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
      const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));

      const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
      const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
      const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
      const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
      const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
      const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); } } } } } } } const int32x4_t 
vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#endif
    const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
    const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_u8(c0 + 0, vout0x01234567);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;
  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;

    size_t k = kc;
    while (k >= 8 * sizeof(uint8_t)) {
      const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
      const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
      const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
      const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
      const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
      const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));

      const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*)
((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), 
vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, 
vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 
1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); } } } } } } }
// Requantize: saturating left pre-shift, Q31 fixed-point multiply, rounding right post-shift (rndnu scheme).
const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567); uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567,
voutput_zero_point); uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567)); uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#endif
const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min); vout2x01234567 = vmax_u8(vout2x01234567, vget_low_u8(voutput_min)); const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max); vout2x01234567 = vmin_u8(vout2x01234567, vget_low_u8(voutput_max)); if (nc >= 8) { vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567)); vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567)); vst1_u8(c2 + 0, vout2x01234567); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); a0 = (const uint8_t*) ((uintptr_t) a0 - kc); a1 = (const uint8_t*) ((uintptr_t) a1 - kc); a2 = (const uint8_t*) ((uintptr_t) a2 - kc); nc -= 8; } else { if (nc & 4) { vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1_lane_u32((void*) c2, vreinterpret_u32_u8(vout2x01234567), 0); c2 += 4; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1_lane_u16((void*) c2, vreinterpret_u16_u8(vout2x01234567), 0); c2 += 2; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1_lane_u8(c2, vout2x01234567, 0); } nc = 0; } } while (nc != 0); }
// QU8 GEMM microkernel, 4x16 tile (four rows of A, sixteen columns of B), same mlal-lane structure as the kernel above.
void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const uint8_t* restrict a, size_t a_stride, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const uint8_t* a0 = a; uint8_t* c0 = c; const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride); uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride); uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride); uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*)
w + 4); int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; int32x4_t vacc1x89AB = vacc0x89AB; int32x4_t vacc1xCDEF = vacc0xCDEF; int32x4_t vacc2x0123 = vacc0x0123; int32x4_t vacc2x4567 = vacc0x4567; int32x4_t vacc2x89AB = vacc0x89AB; int32x4_t vacc2xCDEF = vacc0xCDEF; int32x4_t vacc3x0123 = vacc0x0123; int32x4_t vacc3x4567 = vacc0x4567; int32x4_t vacc3x89AB = vacc0x89AB; int32x4_t vacc3xCDEF = vacc0xCDEF; size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 += 8; const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 += 8; const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 += 8; const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, 
vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, 
vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, 
vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = 
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k); const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, 
vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); 
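// Tap c2 continues below: row 1 columns CDEF, then rows 2 and 3.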
vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, 
vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, 
vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); } } } } } } } const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift); vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift); vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift); vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift); vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift); vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift); vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift); vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier); vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier); vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier); vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier); vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier); vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier); vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier); vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift); vacc0xCDEF = vrshlq_s32(vacc0xCDEF,
vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift); vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift); vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift); vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift); vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift); vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift); vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF); int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567); int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF); uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF); uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF); uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)); int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)); int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF =
vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF)); uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF)); uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF)); uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF));
#endif
const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min); vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min); vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min); vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min); const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max); vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max); vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max); vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max); if (nc >= 16) { vst1q_u8(c0 + 0, vout0x0123456789ABCDEF); vst1q_u8(c1 + 0, vout1x0123456789ABCDEF); vst1q_u8(c2 + 0, vout2x0123456789ABCDEF); vst1q_u8(c3 + 0, vout3x0123456789ABCDEF); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride); a0 = (const uint8_t*) ((uintptr_t) a0 - kc); a1 = (const uint8_t*) ((uintptr_t) a1 - kc); a2 = (const uint8_t*) ((uintptr_t) a2 - kc); a3 = (const uint8_t*) ((uintptr_t) a3 - kc); nc -= 16; } else { uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF)); uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF)); if (nc & 8) { vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8; vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8; vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8; vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8; vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF)); vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF)); } if (nc & 4) { vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4; vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2; vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
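// Rotate the bytes just stored out of each output vector so the next, narrower store reads from lane 0.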
vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0); vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8); } nc = 0; } } while (nc != 0); }
// QU8 IGEMM (indirect GEMM) microkernel, 1x16 tile: A-row pointers are gathered from the indirection array `a`; entries equal to `zero` point at the padding buffer and skip the a_offset adjustment.
void xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, size_t ks, const uint8_t** restrict a, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const uint8_t* zero, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); uint8_t* c0 = c; const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const uint8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const
uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, 
            vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);

          if (k > 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
            const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);

            if (k >= 6 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
              const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);

              if (k > 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
              }
            }
          }
        }
      }
    }
  }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work: requantize the int32 accumulators with the rndnu
    // scheme. (Explanatory note, not upstream commentary: conceptually this is
    //   acc = rounding_shift(q31_doubling_mulh(saturating_shift(acc, pre), multiplier), post),
    // with both shift amounts stored as negative left shifts; the output zero
    // point is then added and the result saturates into [output_min, output_max].)
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);

    if (nc >= 16) {
      vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
      nc -= 16;
    } else {
      uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_u8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
        vacc0x0123 =
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, 
            vb_zero_point));
          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);

          if (k >= 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);

            if (k > 4 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);

              if (k >= 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#endif
    const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
    const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_u8(c0 + 0, vout0x01234567);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (3 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      a += 3;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
        const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1),
vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), 
vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = 
          vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);

          if (k >= 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);

            if (k > 4 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);

              if (k >= 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
                }
              }
            }
          }
        }
      }
      p -= 3 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
    vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
    vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
    vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
    vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
    vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
    int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
    uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
    uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
    int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
    uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
    uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
    vout2x01234567 = vmax_u8(vout2x01234567, vget_low_u8(voutput_min));
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
    vout2x01234567 = vmin_u8(vout2x01234567, vget_low_u8(voutput_max));

    if (nc >= 8) {
      vst1_u8(c2 + 0, vout2x01234567);
      vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567));
      vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567));

      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);

      a = (const uint8_t**restrict) ((uintptr_t) a - ks);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c2, vreinterpret_u32_u8(vout2x01234567), 0); c2 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
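        // Note (editorial, not from the upstream source): after each partial
        // store, the output vectors are rotated with vext so the bytes that are
        // still unwritten move down to lane 0 for the next, narrower store
        // (4 bytes, then 2, then 1).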
        vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 4);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c2, vreinterpret_u16_u8(vout2x01234567), 0); c2 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
        vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 2);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c2, vout2x01234567, 0);
        vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
        vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (4 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    c3 = c2;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc1x89AB = vacc0x89AB;
    int32x4_t vacc1xCDEF = vacc0xCDEF;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;
    int32x4_t vacc2x89AB = vacc0x89AB;
    int32x4_t vacc2xCDEF = vacc0xCDEF;
    int32x4_t vacc3x0123 = vacc0x0123;
    int32x4_t vacc3x4567 = vacc0x4567;
    int32x4_t vacc3x89AB = vacc0x89AB;
    int32x4_t vacc3xCDEF = vacc0xCDEF;

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      const uint8_t* restrict a3 = a[3];
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
      }
      a += 4;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
        const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
        const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
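        // Layout note (an inference from the access pattern here, not upstream
        // documentation): after the four int32x4 bias vectors consumed at the top
        // of the nc loop, w streams 8-byte groups of quantized weights that
        // alternate output columns 0-7 and 8-15 for each of the eight k positions
        // c0..c7 handled per iteration.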
const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = 
vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = 
vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF 
= vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); 
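        // How each update works (explanatory comment, not from the upstream
        // source): vmlal_lane_s16 multiplies eight zero-point-adjusted weights by
        // a single broadcast activation lane and widens into the int32
        // accumulators; the next statement, for example, computes
        //   vacc0xCDEF[i] += (int32_t) vxb89ABCDEFc7[4 + i] * (int32_t) vxa0[7].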
vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k); const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = 
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = 
vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); 
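            // Remainder handling (editorial note, not from the upstream source):
            // the tail still loads full 8-byte vectors -- the kernel is declared
            // XNN_OOB_READS, so reading a little past the valid k bytes is
            // permitted -- and the nested k tests merely skip the
            // multiply-accumulates for k positions beyond the true remainder.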
vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x0123 = 
vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); } } } } } } } p -= 4 * sizeof(void*); } while (p != 0); // Post-accumulation work const int32x4_t vright_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift); vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift); vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift); vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift); vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift); vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift); vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift); vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier); vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier); vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier); vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier); vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier); vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier); vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier); vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift); vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift); vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift); vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift); vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift); vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift); vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift); vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), 
vacc0xCDEF); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF); int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567); int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF); uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF); uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF); uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)); int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)); int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF)); uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF)); uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF)); uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF)); #endif const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min); vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min); vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min); vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); vout0x0123456789ABCDEF = 
vminq_u8(vout0x0123456789ABCDEF, voutput_max); vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max); vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max); vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max); if (nc >= 16) { vst1q_u8(c3 + 0, vout3x0123456789ABCDEF); vst1q_u8(c2 + 0, vout2x0123456789ABCDEF); vst1q_u8(c1 + 0, vout1x0123456789ABCDEF); vst1q_u8(c0 + 0, vout0x0123456789ABCDEF); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); a = (const uint8_t**restrict) ((uintptr_t) a - ks); nc -= 16; } else { uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF)); uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF)); if (nc & 8) { vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8; vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8; vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8; vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8; vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF)); vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF)); } if (nc & 4) { vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4; vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4); vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2; vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2); vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8); vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(¶ms->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(¶ms->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t 
voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, 
vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(¶ms->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(¶ms->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const uint8x8_t vaGHIJKLMN = vld1_u8(input_a); input_a += 8; const uint8x8_t vbGHIJKLMN = vld1_u8(input_b); input_b += 8; const uint8x8_t vaOPQRSTUV = vld1_u8(input_a); input_a += 8; const uint8x8_t vbOPQRSTUV = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); const int16x8_t vxaGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vaGHIJKLMN, va_zero_point)); const int16x8_t vxbGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vbGHIJKLMN, vb_zero_point)); const int16x8_t vxaOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vaOPQRSTUV, va_zero_point)); const int16x8_t vxbOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vbOPQRSTUV, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmulq_s32(vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmulq_s32(vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmulq_s32(vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmulq_s32(vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = 
vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vaccGHIJ = vmlaq_s32(vaccGHIJ, vmovl_s16(vget_low_s16(vxbGHIJKLMN)), vb_multiplier); vaccKLMN = vmlaq_s32(vaccKLMN, vmovl_s16(vget_high_s16(vxbGHIJKLMN)), vb_multiplier); vaccOPQR = vmlaq_s32(vaccOPQR, vmovl_s16(vget_low_s16(vxbOPQRSTUV)), vb_multiplier); vaccSTUV = vmlaq_s32(vaccSTUV, vmovl_s16(vget_high_s16(vxbOPQRSTUV)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void 
xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, 
vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vaGHIJKLMN = vld1_u8(input_a); input_a += 8; const uint8x8_t vaOPQRSTUV = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxaGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vaGHIJKLMN, va_zero_point)); const int16x8_t vxaOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vaOPQRSTUV, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); 
uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vcvt_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vmultiplier = vld1q_dup_s16(¶ms->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vacc0), vqmovun_s16(vacc1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vacc2), vqmovun_s16(vacc3)); vst1q_u8(output, vy0); output 
+= 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vacc); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x16( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vout0), vqmovun_s16(vout1)); vst1q_u8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = 
vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); int16x8_t vin2 = vqdmulhq_s16(vacc2, vinput_scale_div_mantissa); int16x8_t vin3 = vqdmulhq_s16(vacc3, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin2 = vqshlq_s16(vin2, vinput_scale_div_exp); vin3 = vqshlq_s16(vin3, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin2 = vqsubq_s16(vin2, vhalf); vin3 = vqsubq_s16(vin3, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); vin2 = vminq_s16(vin2, vzero); vin3 = vminq_s16(vin3, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); int16x8_t vout2 = vqdmulhq_s16(vacc2, vscale_ratio); int16x8_t vout3 = vqdmulhq_s16(vacc3, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout2 = vqdmulhq_s16(vout2, vin2); vout3 = vqdmulhq_s16(vout3, vin3); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); 
vout2 = vqaddq_s16(vout2, voutput_zero_point); vout3 = vqaddq_s16(vout3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vout0), vqmovun_s16(vout1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vout2), vqmovun_s16(vout3)); vst1q_u8(output, vy0); output += 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x8( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); 
vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vlrelu_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vpositive_multiplier = vld1q_dup_s16(¶ms->neon.positive_multiplier); const int16x8_t vnegative_multiplier = vld1q_dup_s16(¶ms->neon.negative_multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); const uint16x8_t vmask0 = vcltq_s16(vacc0, vmovq_n_s16(0)); const uint16x8_t vmask1 = vcltq_s16(vacc1, vmovq_n_s16(0)); const uint16x8_t vmask2 = vcltq_s16(vacc2, vmovq_n_s16(0)); const uint16x8_t vmask3 = vcltq_s16(vacc3, vmovq_n_s16(0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); const int16x8_t vmultiplier0 = vbslq_s16(vmask0, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier1 = vbslq_s16(vmask1, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier2 = vbslq_s16(vmask2, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier3 = vbslq_s16(vmask3, vpositive_multiplier, vnegative_multiplier); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier0); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier1); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier2); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier3); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vacc0), vqmovun_s16(vacc1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vacc2), vqmovun_s16(vacc3)); vst1q_u8(output, vy0); output += 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vacc); vst1_u8(output, 
vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(params->rndnu_neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(params->rndnu_neon.b_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb89ABCDEF)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = 
vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); #if XNN_ARCH_ARM64 uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); #else uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); #endif vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(params->rndnu_neon.a_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); const uint8x16_t voutput_max = 
vld1q_dup_u8(¶ms->rndnu_neon.output_max); const uint8x8_t vb = vld1_dup_u8(input_b); const uint8x8_t vb_zero_point = vld1_dup_u8(params->rndnu_neon.b_zero_point); const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point)); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); #if XNN_ARCH_ARM64 uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); #else uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); #endif vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, 
vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_s8_ibilinear_ukernel__neon_c16( size_t output_pixels, size_t channels, const int8_t** restrict input, size_t input_offset, const int16_t* restrict weights, int8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset); const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset); const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset); const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 16 * sizeof(int8_t); c -= 16 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vtl89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vtr89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vbl89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vbr89ABCDEF = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vtd89ABCDEF = vsubl_s8(vtr89ABCDEF, vtl89ABCDEF); const int16x8_t vbd89ABCDEF = vsubl_s8(vbr89ABCDEF, vbl89ABCDEF); const int16x8_t vdl89ABCDEF = vsubl_s8(vbl89ABCDEF, vtl89ABCDEF); const int16x8_t vxtl89ABCDEF = vmovl_s8(vtl89ABCDEF); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); const int16x8_t vdd89ABCDEF = vsubq_s16(vbd89ABCDEF, vtd89ABCDEF); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vtCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vtd89ABCDEF, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vdCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vdd89ABCDEF, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = 
vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), valphah); const int32x4_t vtCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vget_high_s16(vtd89ABCDEF), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), valphah); const int32x4_t vdCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vget_high_s16(vdd89ABCDEF), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); const int32x4_t vacc89AB = vmlaq_s32(vshlq_n_s32(vt89AB, 11), vd89AB, valphav); const int32x4_t vaccCDEF = vmlaq_s32(vshlq_n_s32(vtCDEF, 11), vdCDEF, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vuzp2q_s16(vreinterpretq_s16_s32(vacc89AB), vreinterpretq_s16_s32(vaccCDEF)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); const int16x8_t vacc89ABCDEF = vcombine_s16(vshrn_n_s32(vacc89AB, 16), vshrn_n_s32(vaccCDEF, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); const int8x8_t vo89ABCDEF = vrshrn_n_s16(vacc89ABCDEF, 6); vst1_s8(output, vo01234567); output += 8; vst1_s8(output, vo89ABCDEF); output += 8; } for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 
const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); vst1_s8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const int8x8_t vtl01234567 = vld1_s8(i0); const int8x8_t vtr01234567 = vld1_s8(i1); const int8x8_t vbl01234567 = vld1_s8(i2); const int8x8_t vbr01234567 = vld1_s8(i3); const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); if (c & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vo01234567), 0); output += 4; vo01234567 = vext_s8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vo01234567), 0); output += 2; vo01234567 = vext_s8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vo01234567, 0); output += 1; } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_s8_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const int8_t** restrict input, size_t input_offset, const int16_t* restrict weights, int8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset); const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset); const int8_t* i2 = (const int8_t*) ((uintptr_t) 
input[2] + input_offset); const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); vst1_s8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const int8x8_t vtl01234567 = vld1_s8(i0); const int8x8_t vtr01234567 = vld1_s8(i1); const int8x8_t vbl01234567 = vld1_s8(i2); const int8x8_t vbr01234567 = vld1_s8(i3); const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const 
int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); if (c & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vo01234567), 0); output += 4; vo01234567 = vext_s8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vo01234567), 0); output += 2; vo01234567 = vext_s8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vo01234567, 0); output += 1; } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const int8_t** input, size_t input_offset, int8_t* output, size_t input_increment, size_t output_increment, const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.max); const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.min); do { int8_t* o = output; { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; const int8_t* i8 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const int8x16_t vi0 = vld1q_s8(i0); i0 += 16; const int8x16_t vi1 = vld1q_s8(i1); i1 += 16; const int8x16_t vi2 = vld1q_s8(i2); i2 += 16; const int8x16_t vi3 = vld1q_s8(i3); i3 += 16; const int8x16_t vi4 = vld1q_s8(i4); i4 += 16; const int8x16_t vi5 = vld1q_s8(i5); i5 += 16; const int8x16_t vi6 = vld1q_s8(i6); i6 += 16; const int8x16_t vi7 = vld1q_s8(i7); i7 += 16; const int8x16_t vi8 = vld1q_s8(i8); i8 += 16; const int8x16_t 
vmax018 = vmaxq_s8(vmaxq_s8(vi0, vi1), vi8); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax01678 = vmaxq_s8(vmax018, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax01678); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); vst1q_s8(o, vout); o += 16; } if (c != 0) { const int8x16_t vi0 = vld1q_s8(i0); const int8x16_t vi1 = vld1q_s8(i1); const int8x16_t vi2 = vld1q_s8(i2); const int8x16_t vi3 = vld1q_s8(i3); const int8x16_t vi4 = vld1q_s8(i4); const int8x16_t vi5 = vld1q_s8(i5); const int8x16_t vi6 = vld1q_s8(i6); const int8x16_t vi7 = vld1q_s8(i7); const int8x16_t vi8 = vld1q_s8(i8); const int8x16_t vmax018 = vmaxq_s8(vmaxq_s8(vi0, vi1), vi8); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax01678 = vmaxq_s8(vmax018, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax01678); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); int8x8_t vout_lo = vget_low_s8(vout); if (c & 8) { vst1_s8(o, vout_lo); o += 8; vout_lo = vget_high_s8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_s8(vout_lo), 0); o += 4; vout_lo = vext_s8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_s8(vout_lo), 0); o += 2; vout_lo = vext_s8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_s8(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 16; c -= 16) { const int8x16_t vi0 = vld1q_s8(i0); i0 += 16; const int8x16_t vi1 = vld1q_s8(i1); i1 += 16; const int8x16_t vi2 = vld1q_s8(i2); i2 += 16; const int8x16_t vi3 = vld1q_s8(i3); i3 += 16; const int8x16_t vi4 = vld1q_s8(i4); i4 += 16; const int8x16_t vi5 = vld1q_s8(i5); i5 += 16; const int8x16_t vi6 = vld1q_s8(i6); i6 += 16; const int8x16_t vi7 = vld1q_s8(i7); i7 += 16; const int8x16_t vo = vld1q_s8(o); const int8x16_t vmax01 = vmaxq_s8(vmaxq_s8(vi0, vi1), vo); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax0167 = vmaxq_s8(vmax01, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax0167); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); vst1q_s8(o, vout); o += 16; } if (c != 0) { const int8x16_t vi0 = vld1q_s8(i0); const int8x16_t vi1 = vld1q_s8(i1); const int8x16_t vi2 = vld1q_s8(i2); const 
int8x16_t vi3 = vld1q_s8(i3); const int8x16_t vi4 = vld1q_s8(i4); const int8x16_t vi5 = vld1q_s8(i5); const int8x16_t vi6 = vld1q_s8(i6); const int8x16_t vi7 = vld1q_s8(i7); const int8x16_t vo = vld1q_s8(o); const int8x16_t vmax01 = vmaxq_s8(vmaxq_s8(vi0, vi1), vo); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax0167 = vmaxq_s8(vmax01, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax0167); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); int8x8_t vout_lo = vget_low_s8(vout); if (c & 8) { vst1_s8(o, vout_lo); o += 8; vout_lo = vget_high_s8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_s8(vout_lo), 0); o += 4; vout_lo = vext_s8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_s8(vout_lo), 0); o += 2; vout_lo = vext_s8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_s8(o, vout_lo, 0); o += 1; } } } input = (const int8_t**) ((uintptr_t) input + input_increment); output = (int8_t*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_s8_vclamp_ukernel__neon_x64( size_t batch, const int8_t* input, int8_t* output, const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.max); const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.min); for (; batch >= 64; batch -= 64) { int8x16_t vacc0 = vld1q_s8(input); input += 16; int8x16_t vacc1 = vld1q_s8(input); input += 16; int8x16_t vacc2 = vld1q_s8(input); input += 16; int8x16_t vacc3 = vld1q_s8(input); input += 16; vacc0 = vmaxq_s8(vacc0, voutput_min); vacc1 = vmaxq_s8(vacc1, voutput_min); vacc2 = vmaxq_s8(vacc2, voutput_min); vacc3 = vmaxq_s8(vacc3, voutput_min); vacc0 = vminq_s8(vacc0, voutput_max); vacc1 = vminq_s8(vacc1, voutput_max); vacc2 = vminq_s8(vacc2, voutput_max); vacc3 = vminq_s8(vacc3, voutput_max); vst1q_s8(output, vacc0); output += 16; vst1q_s8(output, vacc1); output += 16; vst1q_s8(output, vacc2); output += 16; vst1q_s8(output, vacc3); output += 16; } for (; batch >= 8; batch -= 8) { int8x8_t vacc = vld1_s8(input); input += 8; vacc = vmin_s8(vacc, vget_low_s8(voutput_max)); vacc = vmax_s8(vacc, vget_low_s8(voutput_min)); vst1_s8(output, vacc); output += 8; } if XNN_UNLIKELY(batch != 0) { int8x8_t vacc = vld1_s8(input); input += 8; vacc = vmin_s8(vacc, vget_low_s8(voutput_max)); vacc = vmax_s8(vacc, vget_low_s8(voutput_min)); if (batch & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vacc), 0); output += 4; vacc = vext_s8(vacc, vacc, 4); } if (batch & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vacc), 0); output += 2; vacc = vext_s8(vacc, vacc, 2); } if (batch & 1) { vst1_lane_s8(output, vacc, 0); } } } void xnn_u8_ibilinear_ukernel__neon_c16( size_t output_pixels, size_t channels, const uint8_t** restrict input, size_t input_offset, const int16_t* restrict weights, uint8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset); const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset); 
input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vtl89ABCDEF = vld1_u8(i0); i0 += 8; const uint8x8_t vtr89ABCDEF = vld1_u8(i1); i1 += 8; const uint8x8_t vbl89ABCDEF = vld1_u8(i2); i2 += 8; const uint8x8_t vbr89ABCDEF = vld1_u8(i3); i3 += 8; const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vtd89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vtr89ABCDEF, vtl89ABCDEF)); const int16x8_t vbd89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vbr89ABCDEF, vbl89ABCDEF)); const int16x8_t vdl89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vbl89ABCDEF, vtl89ABCDEF)); const int16x8_t vxtl89ABCDEF = vreinterpretq_s16_u16(vmovl_u8(vtl89ABCDEF)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); const int16x8_t vdd89ABCDEF = vsubq_s16(vbd89ABCDEF, vtd89ABCDEF); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vtCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vtd89ABCDEF, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vdCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vdd89ABCDEF, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), valphah); const int32x4_t vtCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vget_high_s16(vtd89ABCDEF), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), valphah); const int32x4_t vdCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vget_high_s16(vdd89ABCDEF), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t 
vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); const int32x4_t vacc89AB = vmlaq_s32(vshlq_n_s32(vt89AB, 11), vd89AB, valphav); const int32x4_t vaccCDEF = vmlaq_s32(vshlq_n_s32(vtCDEF, 11), vdCDEF, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vuzp2q_s16(vreinterpretq_s16_s32(vacc89AB), vreinterpretq_s16_s32(vaccCDEF)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); const int16x8_t vacc89ABCDEF = vcombine_s16(vshrn_n_s32(vacc89AB, 16), vshrn_n_s32(vaccCDEF, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); const uint8x8_t vo89ABCDEF = vrshrn_n_u16(vreinterpretq_u16_s16(vacc89ABCDEF), 6); vst1_u8(output, vo01234567); output += 8; vst1_u8(output, vo89ABCDEF); output += 8; } for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); vst1_u8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const uint8x8_t vtl01234567 = vld1_u8(i0); const uint8x8_t vtr01234567 = vld1_u8(i1); const uint8x8_t vbl01234567 = vld1_u8(i2); const uint8x8_t vbr01234567 = vld1_u8(i3); const int16x8_t vtd01234567 = 
vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vo01234567), 0); output += 4; vo01234567 = vext_u8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vo01234567), 0); output += 2; vo01234567 = vext_u8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vo01234567, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_u8_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const uint8_t** restrict input, size_t input_offset, const int16_t* restrict weights, uint8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset); const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; 
const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); vst1_u8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const uint8x8_t vtl01234567 = vld1_u8(i0); const uint8x8_t vtr01234567 = vld1_u8(i1); const uint8x8_t vbl01234567 = vld1_u8(i2); const uint8x8_t vbr01234567 = vld1_u8(i3); const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 
11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vo01234567), 0); output += 4; vo01234567 = vext_u8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vo01234567), 0); output += 2; vo01234567 = vext_u8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vo01234567, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const uint8_t** input, size_t input_offset, uint8_t* output, size_t input_increment, size_t output_increment, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.max); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.min); do { uint8_t* o = output; { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; const uint8_t* i8 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16; const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16; const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16; const uint8x16_t vi3 = vld1q_u8(i3); i3 += 16; const uint8x16_t vi4 = vld1q_u8(i4); i4 += 16; const uint8x16_t vi5 = vld1q_u8(i5); i5 += 16; const uint8x16_t vi6 = vld1q_u8(i6); i6 += 16; const uint8x16_t vi7 = vld1q_u8(i7); i7 += 16; const uint8x16_t vi8 = vld1q_u8(i8); i8 += 16; const uint8x16_t vmax018 = vmaxq_u8(vmaxq_u8(vi0, vi1), vi8); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax01678 = vmaxq_u8(vmax018, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax01678); const 
uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); vst1q_u8(o, vout); o += 16; } if (c != 0) { const uint8x16_t vi0 = vld1q_u8(i0); const uint8x16_t vi1 = vld1q_u8(i1); const uint8x16_t vi2 = vld1q_u8(i2); const uint8x16_t vi3 = vld1q_u8(i3); const uint8x16_t vi4 = vld1q_u8(i4); const uint8x16_t vi5 = vld1q_u8(i5); const uint8x16_t vi6 = vld1q_u8(i6); const uint8x16_t vi7 = vld1q_u8(i7); const uint8x16_t vi8 = vld1q_u8(i8); const uint8x16_t vmax018 = vmaxq_u8(vmaxq_u8(vi0, vi1), vi8); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax01678 = vmaxq_u8(vmax018, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax01678); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); uint8x8_t vout_lo = vget_low_u8(vout); if (c & 8) { vst1_u8(o, vout_lo); o += 8; vout_lo = vget_high_u8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_u8(vout_lo), 0); o += 4; vout_lo = vext_u8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_u8(vout_lo), 0); o += 2; vout_lo = vext_u8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_u8(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 16; c -= 16) { const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16; const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16; const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16; const uint8x16_t vi3 = vld1q_u8(i3); i3 += 16; const uint8x16_t vi4 = vld1q_u8(i4); i4 += 16; const uint8x16_t vi5 = vld1q_u8(i5); i5 += 16; const uint8x16_t vi6 = vld1q_u8(i6); i6 += 16; const uint8x16_t vi7 = vld1q_u8(i7); i7 += 16; const uint8x16_t vo = vld1q_u8(o); const uint8x16_t vmax01 = vmaxq_u8(vmaxq_u8(vi0, vi1), vo); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax0167 = vmaxq_u8(vmax01, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax0167); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); vst1q_u8(o, vout); o += 16; } if (c != 0) { const uint8x16_t vi0 = vld1q_u8(i0); const uint8x16_t vi1 = vld1q_u8(i1); const uint8x16_t vi2 = vld1q_u8(i2); const uint8x16_t vi3 = vld1q_u8(i3); const uint8x16_t vi4 = vld1q_u8(i4); const uint8x16_t vi5 = vld1q_u8(i5); const uint8x16_t vi6 = vld1q_u8(i6); const uint8x16_t vi7 = vld1q_u8(i7); const uint8x16_t vo = vld1q_u8(o); const uint8x16_t vmax01 = vmaxq_u8(vmaxq_u8(vi0, 
vi1), vo); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax0167 = vmaxq_u8(vmax01, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax0167); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); uint8x8_t vout_lo = vget_low_u8(vout); if (c & 8) { vst1_u8(o, vout_lo); o += 8; vout_lo = vget_high_u8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_u8(vout_lo), 0); o += 4; vout_lo = vext_u8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_u8(vout_lo), 0); o += 2; vout_lo = vext_u8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_u8(o, vout_lo, 0); o += 1; } } } input = (const uint8_t**) ((uintptr_t) input + input_increment); output = (uint8_t*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_u8_rmax_ukernel__neon( size_t batch, const uint8_t* input, uint8_t* output) { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); if XNN_LIKELY(batch >= 16) { uint8x16_t vmax = vmovq_n_u8(0); do { const uint8x16_t vx = vld1q_u8(input); input += 16; vmax = vmaxq_u8(vmax, vx); batch -= 16; } while (batch >= 16); if (batch != 0) { const size_t x_increment = batch - 16; input = (const uint8_t*) ((uintptr_t) input + x_increment); const uint8x16_t vx = vld1q_u8(input); vmax = vmaxq_u8(vmax, vx); } uint8x8_t vmax8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax)); const uint8x8_t vmax4 = vpmax_u8(vmax8, vmax8); const uint8x8_t vmax2 = vpmax_u8(vmax4, vmax4); const uint8x8_t vmax1 = vpmax_u8(vmax2, vmax2); vst1_lane_u8(output, vmax1, 0); } else { uint8x8_t vmax = vmov_n_u8(0); do { const uint8x8_t vx = vld1_dup_u8(input); input += 1; vmax = vmax_u8(vmax, vx); } while (--batch != 0); vst1_lane_u8(output, vmax, 0); } } void xnn_u8_vclamp_ukernel__neon_x64( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.max); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.min); for (; batch >= 64; batch -= 64) { uint8x16_t vacc0 = vld1q_u8(input); input += 16; uint8x16_t vacc1 = vld1q_u8(input); input += 16; uint8x16_t vacc2 = vld1q_u8(input); input += 16; uint8x16_t vacc3 = vld1q_u8(input); input += 16; vacc0 = vmaxq_u8(vacc0, voutput_min); vacc1 = vmaxq_u8(vacc1, voutput_min); vacc2 = vmaxq_u8(vacc2, voutput_min); vacc3 = vmaxq_u8(vacc3, voutput_min); vacc0 = vminq_u8(vacc0, voutput_max); vacc1 = vminq_u8(vacc1, voutput_max); vacc2 = vminq_u8(vacc2, voutput_max); vacc3 = vminq_u8(vacc3, voutput_max); vst1q_u8(output, vacc0); output += 16; vst1q_u8(output, vacc1); output += 16; vst1q_u8(output, vacc2); output += 16; vst1q_u8(output, vacc3); output += 16; } for (; batch >= 8; batch -= 8) { uint8x8_t vacc = vld1_u8(input); input += 8; vacc = vmin_u8(vacc, vget_low_u8(voutput_max)); vacc = vmax_u8(vacc, vget_low_u8(voutput_min)); vst1_u8(output, vacc); output += 8; } if XNN_UNLIKELY(batch != 0) { uint8x8_t vacc = vld1_u8(input); input += 8; vacc = vmin_u8(vacc, vget_low_u8(voutput_max)); vacc = vmax_u8(vacc, vget_low_u8(voutput_min)); if (batch & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vacc), 0); output += 4; vacc = vext_u8(vacc, vacc, 4); } if (batch & 
2) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vacc), 0); output += 2; vacc = vext_u8(vacc, vacc, 2); } if (batch & 1) { vst1_lane_u8(output, vacc, 0); } } } void xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_x8_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint16_t* weights, const uint16_t* bias, uint16_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 16); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); uint16x8x4_t vtmp0123x01234567; vtmp0123x01234567.val[0] = vdupq_n_u16(0); vtmp0123x01234567.val[1] = vdupq_n_u16(0); vtmp0123x01234567.val[2] = vdupq_n_u16(0); vtmp0123x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x01234567; vtmp4567x01234567.val[0] = vdupq_n_u16(0); vtmp4567x01234567.val[1] = vdupq_n_u16(0); vtmp4567x01234567.val[2] = vdupq_n_u16(0); vtmp4567x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp0123x89ABCDEF; vtmp0123x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x89ABCDEF; vtmp4567x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[3] = vdupq_n_u16(0); do { // NC main loop multiple of 16 const uint16_t* w0 = weights; size_t n = nc; for (; n >= 16; n -= 16) { if XNN_LIKELY(bias != NULL) { uint16x8_t vb0 = vld1q_u16(bias); bias += 8; uint16x8_t vb8 = vld1q_u16(bias); bias += 8; vst1q_u16(packed_weights, vb0); packed_weights += 8; vst1q_u16(packed_weights, vb8); packed_weights += 8; } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; vst1q_u16(packed_weights, vzero); packed_weights += 8; } const uint16_t* w1 = w0 + kc; const uint16_t* w2 = w1 + kc; const uint16_t* w3 = w2 + kc; const uint16_t* w4 = w3 + kc; const uint16_t* w5 = w4 + kc; const uint16_t* w6 = w5 + kc; const uint16_t* w7 = w6 + kc; const uint16_t* w8 = w7 + kc; const uint16_t* w9 = w8 + kc; const uint16_t* w10 = w9 + kc; const uint16_t* w11 = w10 + kc; const uint16_t* w12 = w11 + kc; const uint16_t* w13 = w12 + kc; const uint16_t* w14 = w13 + kc; const uint16_t* w15 = w14 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); xnn_prefetch_to_l1((const int8_t*) w8); xnn_prefetch_to_l1((const int8_t*) w8 + 64); xnn_prefetch_to_l1((const int8_t*) w9); xnn_prefetch_to_l1((const int8_t*) w9 + 64); xnn_prefetch_to_l1((const int8_t*) w10); xnn_prefetch_to_l1((const int8_t*) w10 + 64); xnn_prefetch_to_l1((const int8_t*) w11); xnn_prefetch_to_l1((const int8_t*) w11 + 64); xnn_prefetch_to_l1((const int8_t*) w12); xnn_prefetch_to_l1((const int8_t*) w12 + 64); xnn_prefetch_to_l1((const int8_t*) w13); xnn_prefetch_to_l1((const int8_t*) w13 + 64); 
xnn_prefetch_to_l1((const int8_t*) w14); xnn_prefetch_to_l1((const int8_t*) w14 + 64); xnn_prefetch_to_l1((const int8_t*) w15); xnn_prefetch_to_l1((const int8_t*) w15 + 64); // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w15, vtmp0123x89ABCDEF, 7); w15 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w8, vtmp4567x89ABCDEF, 0); w8 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w9, vtmp4567x89ABCDEF, 1); w9 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w10, vtmp4567x89ABCDEF, 2); w10 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w11, vtmp4567x89ABCDEF, 3); w11 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w12, vtmp4567x89ABCDEF, 4); w12 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w13, vtmp4567x89ABCDEF, 5); w13 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w14, vtmp4567x89ABCDEF, 6); w14 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w15, vtmp4567x89ABCDEF, 7); w15 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); xnn_prefetch_to_l1((const int8_t*) w8 + 128); xnn_prefetch_to_l1((const int8_t*) w9 + 128); xnn_prefetch_to_l1((const int8_t*) w10 + 128); xnn_prefetch_to_l1((const int8_t*) w11 + 128); xnn_prefetch_to_l1((const int8_t*) w12 + 128); xnn_prefetch_to_l1((const int8_t*) w13 + 128); xnn_prefetch_to_l1((const int8_t*) w14 + 128); xnn_prefetch_to_l1((const int8_t*) w15 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, 
vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w15, vtmp0123x89ABCDEF, 7); w15 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 16x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); uint16x8_t vtmp0x89ABCDEF = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w8, 
vtmp0x89ABCDEF, 0); w8 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w9, vtmp0x89ABCDEF, 1); w9 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w10, vtmp0x89ABCDEF, 2); w10 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w11, vtmp0x89ABCDEF, 3); w11 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w12, vtmp0x89ABCDEF, 4); w12 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w13, vtmp0x89ABCDEF, 5); w13 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w14, vtmp0x89ABCDEF, 6); w14 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w15, vtmp0x89ABCDEF, 7); w15 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; vst1q_u16(packed_weights, vtmp0x89ABCDEF); packed_weights += 8; break; } // KC remainder of 16x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); uint16x8x2_t vtmp01x89ABCDEF; vtmp01x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp01x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w8, vtmp01x89ABCDEF, 0); w8 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w9, vtmp01x89ABCDEF, 1); w9 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w10, vtmp01x89ABCDEF, 2); w10 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w11, vtmp01x89ABCDEF, 3); w11 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w12, vtmp01x89ABCDEF, 4); w12 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w13, vtmp01x89ABCDEF, 5); w13 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w14, vtmp01x89ABCDEF, 6); w14 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w15, vtmp01x89ABCDEF, 7); w15 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[1]); packed_weights += 8; break; } // KC remainder of 16x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); uint16x8x3_t vtmp012x89ABCDEF; vtmp012x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w8, vtmp012x89ABCDEF, 0); w8 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w9, vtmp012x89ABCDEF, 1); w9 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w10, vtmp012x89ABCDEF, 2); w10 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w11, vtmp012x89ABCDEF, 3); w11 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w12, vtmp012x89ABCDEF, 4); w12 += 3; vtmp012x89ABCDEF = 
vld3q_lane_u16(w13, vtmp012x89ABCDEF, 5); w13 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w14, vtmp012x89ABCDEF, 6); w14 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w15, vtmp012x89ABCDEF, 7); w15 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w15; } // NC remainder (1..15) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 15); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (16 - n); } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; vst1q_u16(packed_weights, vzero); packed_weights += 8; } // NR remainder has less than 16 rows so last row is not loaded const uint16_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint16_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint16_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint16_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint16_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint16_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } const uint16_t* w7 = w6 + kc; if XNN_UNPREDICTABLE(n < 8) { w7 = w6; } const uint16_t* w8 = w7 + kc; if XNN_UNPREDICTABLE(n <= 8) { w8 = w7; } const uint16_t* w9 = w8 + kc; if XNN_UNPREDICTABLE(n < 10) { w9 = w8; } const uint16_t* w10 = w9 + kc; if XNN_UNPREDICTABLE(n <= 10) { w10 = w9; } const uint16_t* w11 = w10 + kc; if XNN_UNPREDICTABLE(n < 12) { w11 = w10; } const uint16_t* w12 = w11 + kc; if XNN_UNPREDICTABLE(n <= 12) { w12 = w11; } const uint16_t* w13 = w12 + kc; if XNN_UNPREDICTABLE(n < 14) { w13 = w12; } const uint16_t* w14 = w13 + kc; if XNN_UNPREDICTABLE(n <= 14) { w14 = w13; } // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, 
vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w8, vtmp4567x89ABCDEF, 0); w8 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w9, vtmp4567x89ABCDEF, 1); w9 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w10, vtmp4567x89ABCDEF, 2); w10 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w11, vtmp4567x89ABCDEF, 3); w11 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w12, vtmp4567x89ABCDEF, 4); w12 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w13, vtmp4567x89ABCDEF, 5); w13 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w14, vtmp4567x89ABCDEF, 6); w14 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); xnn_prefetch_to_l1((const int8_t*) w8 + 128); xnn_prefetch_to_l1((const int8_t*) w9 + 128); xnn_prefetch_to_l1((const int8_t*) w10 + 128); xnn_prefetch_to_l1((const int8_t*) w11 + 128); xnn_prefetch_to_l1((const int8_t*) w12 + 128); xnn_prefetch_to_l1((const int8_t*) w13 + 128); xnn_prefetch_to_l1((const int8_t*) w14 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = 
vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 16x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); uint16x8_t vtmp0x89ABCDEF = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w8, vtmp0x89ABCDEF, 0); w8 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w9, vtmp0x89ABCDEF, 1); w9 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w10, vtmp0x89ABCDEF, 2); w10 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w11, vtmp0x89ABCDEF, 3); w11 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w12, vtmp0x89ABCDEF, 4); w12 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w13, vtmp0x89ABCDEF, 5); w13 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w14, vtmp0x89ABCDEF, 6); w14 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; vst1q_u16(packed_weights, vtmp0x89ABCDEF); packed_weights += 8; break; } // KC remainder of 16x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); uint16x8x2_t vtmp01x89ABCDEF; vtmp01x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp01x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w8, vtmp01x89ABCDEF, 0); w8 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w9, vtmp01x89ABCDEF, 1); w9 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w10, vtmp01x89ABCDEF, 2); w10 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w11, vtmp01x89ABCDEF, 3); w11 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w12, vtmp01x89ABCDEF, 4); w12 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w13, vtmp01x89ABCDEF, 5); w13 += 2; vtmp01x89ABCDEF = 
vld2q_lane_u16(w14, vtmp01x89ABCDEF, 6); w14 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[1]); packed_weights += 8; break; } // KC remainder of 16x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); uint16x8x3_t vtmp012x89ABCDEF; vtmp012x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w8, vtmp012x89ABCDEF, 0); w8 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w9, vtmp012x89ABCDEF, 1); w9 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w10, vtmp012x89ABCDEF, 2); w10 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w11, vtmp012x89ABCDEF, 3); w11 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w12, vtmp012x89ABCDEF, 4); w12 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w13, vtmp012x89ABCDEF, 5); w13 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w14, vtmp012x89ABCDEF, 6); w14 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_x8_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint16_t* weights, const uint16_t* bias, uint16_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); uint16x8x4_t vtmp0123x01234567; vtmp0123x01234567.val[0] = vdupq_n_u16(0); vtmp0123x01234567.val[1] = vdupq_n_u16(0); vtmp0123x01234567.val[2] = vdupq_n_u16(0); vtmp0123x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x01234567; vtmp4567x01234567.val[0] = vdupq_n_u16(0); vtmp4567x01234567.val[1] = vdupq_n_u16(0); vtmp4567x01234567.val[2] = vdupq_n_u16(0); vtmp4567x01234567.val[3] = vdupq_n_u16(0); do { // NC main loop multiple of 8 const uint16_t* w0 = weights; size_t n = nc; for (; n >= 8; n -= 8) { if XNN_LIKELY(bias != NULL) { uint16x8_t vb0 = vld1q_u16(bias); bias += 8; vst1q_u16(packed_weights, vb0); packed_weights += 8; } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; } const uint16_t* w1 = w0 + kc; const uint16_t* w2 = w1 + kc; const 
uint16_t* w3 = w2 + kc; const uint16_t* w4 = w3 + kc; const uint16_t* w5 = w4 + kc; const uint16_t* w6 = w5 + kc; const uint16_t* w7 = w6 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = 
vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 8x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; break; } // KC remainder of 8x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; break; } // KC remainder of 8x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; } // NR remainder has less than 8 rows so last row is not loaded const uint16_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint16_t* w2 = w1 
+ kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint16_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint16_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint16_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint16_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 8x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, 
vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; break; } // KC remainder of 8x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; break; } // KC remainder of 8x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon( const uint16_t* input, uint16_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x16_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint16_t)); assert(input_stride >= block_width * sizeof(uint16_t)); const size_t tile_height = 8; const size_t tile_width = 8; const size_t tile_hbytes = tile_height * sizeof(uint16_t); const size_t tile_wbytes = tile_width * sizeof(uint16_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes; const uint16_t* i0 = input; uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 7); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 8; bh -= 8) { const uint16x8_t v3_0 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_1 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_2 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + 
input_stride); const uint16x8_t v3_3 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_4 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_5 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_6 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_7 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8x2_t v2_0 = vzipq_u16(v3_0, v3_4); const uint16x8x2_t v2_1 = vzipq_u16(v3_1, v3_5); const uint16x8x2_t v2_2 = vzipq_u16(v3_2, v3_6); const uint16x8x2_t v2_3 = vzipq_u16(v3_3, v3_7); const uint16x8x2_t v1_0 = vzipq_u16(v2_0.val[0], v2_2.val[0]); const uint16x8x2_t v1_1 = vzipq_u16(v2_0.val[1], v2_2.val[1]); const uint16x8x2_t v1_2 = vzipq_u16(v2_1.val[0], v2_3.val[0]); const uint16x8x2_t v1_3 = vzipq_u16(v2_1.val[1], v2_3.val[1]); const uint16x8x2_t v0_0 = vzipq_u16(v1_0.val[0], v1_2.val[0]); const uint16x8x2_t v0_1 = vzipq_u16(v1_0.val[1], v1_2.val[1]); const uint16x8x2_t v0_2 = vzipq_u16(v1_1.val[0], v1_3.val[0]); const uint16x8x2_t v0_3 = vzipq_u16(v1_1.val[1], v1_3.val[1]); o = (uint16_t*) ((uintptr_t) o + oN_offset); vst1q_u16(o, v0_3.val[1]); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_3.val[0]); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_2.val[1]); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_2.val[0]); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_0.val[1]); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_0.val[0]); } o = (uint16_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint16x8_t v3_0 = vld1q_u16(i0); const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint16x8_t v3_1 = vld1q_u16(i1); const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint16x8_t v3_2 = vld1q_u16(i2); const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const uint16x8_t v3_3 = vld1q_u16(i3); const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const uint16x8_t v3_4 = vld1q_u16(i4); const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const uint16x8_t v3_5 = vld1q_u16(i5); const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const uint16x8_t v3_6 = vld1q_u16(i6); const uint16x8_t v3_7 = vmovq_n_u16(0); const uint16x8x2_t v2_0 = vzipq_u16(v3_0, v3_4); const uint16x8x2_t v2_1 = vzipq_u16(v3_1, v3_5); const uint16x8x2_t v2_2 = vzipq_u16(v3_2, v3_6); const uint16x8x2_t v2_3 = vzipq_u16(v3_3, v3_7); const uint16x8x2_t v1_0 = vzipq_u16(v2_0.val[0], v2_2.val[0]); const uint16x8x2_t v1_1 = vzipq_u16(v2_0.val[1], v2_2.val[1]); const uint16x8x2_t v1_2 = vzipq_u16(v2_1.val[0], v2_3.val[0]); 
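// The transpose itself is a three-stage zip (butterfly) network: stage 1 interleaves source rows 4 apart (v3_* -> v2_*), stage 2 rows 2 apart (v2_* -> v1_*), and stage 3 rows 1 apart (v1_* -> v0_*); after the final stage the val[0]/val[1] halves of v0_0..v0_3 hold the eight columns of the source tile, i.e. the transposed rows.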
const uint16x8x2_t v1_3 = vzipq_u16(v2_1.val[1], v2_3.val[1]); const uint16x8x2_t v0_0 = vzipq_u16(v1_0.val[0], v1_2.val[0]); const uint16x8x2_t v0_1 = vzipq_u16(v1_0.val[1], v1_2.val[1]); const uint16x8x2_t v0_2 = vzipq_u16(v1_1.val[0], v1_3.val[0]); const uint16x8x2_t v0_3 = vzipq_u16(v1_1.val[1], v1_3.val[1]); uint16x4_t v0_low = vget_low_u16(v0_0.val[0]); uint16x4_t v1_low = vget_low_u16(v0_0.val[1]); uint16x4_t v2_low = vget_low_u16(v0_1.val[0]); uint16x4_t v3_low = vget_low_u16(v0_1.val[1]); uint16x4_t v4_low = vget_low_u16(v0_2.val[0]); uint16x4_t v5_low = vget_low_u16(v0_2.val[1]); uint16x4_t v6_low = vget_low_u16(v0_3.val[0]); uint16x4_t v7_low = vget_low_u16(v0_3.val[1]); if (bh & 4) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_u16(o, v7_low); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v6_low); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v5_low); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v4_low); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v0_low); o += 4; v0_low = vget_high_u16(v0_0.val[0]); v1_low = vget_high_u16(v0_0.val[1]); v2_low = vget_high_u16(v0_1.val[0]); v3_low = vget_high_u16(v0_1.val[1]); v4_low = vget_high_u16(v0_2.val[0]); v5_low = vget_high_u16(v0_2.val[1]); v6_low = vget_high_u16(v0_3.val[0]); v7_low = vget_high_u16(v0_3.val[1]); } if (bh & 2) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2; v0_low = vext_u16(v0_low, v0_low, 2); v1_low = vext_u16(v1_low, v1_low, 2); v2_low = vext_u16(v2_low, v2_low, 2); v3_low = vext_u16(v3_low, v3_low, 2); v4_low = vext_u16(v4_low, v4_low, 2); v5_low = vext_u16(v5_low, v5_low, 2); v6_low = vext_u16(v6_low, v6_low, 2); v7_low = vext_u16(v7_low, v7_low, 2); } if (bh & 1) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_lane_u16(o, v7_low, 0); if XNN_UNPREDICTABLE(block_width 
> 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v6_low, 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v5_low, 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v4_low, 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v0_low, 0); } } i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset); o = (uint16_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void xnn_x24_transposec_ukernel__2x2_neon_tbl64( const void* input, void* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x24_transpose_params* params) XNN_OOB_READS { assert(output_stride >= block_height * 3); assert(input_stride >= block_width * 3); const size_t tile_height = 2; const size_t tile_width = 2; const size_t tile_wbytes = tile_width * 3; const size_t tile_wbytes_minus_4 = tile_wbytes - 4; const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_height * output_stride - block_height * 3; const size_t tile_stride = tile_height * input_stride; const uint8_t* i0 = (const uint8_t*) input; const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); uint8_t* o0 = (uint8_t*) output; uint8_t* o1 = (uint8_t*) ((uintptr_t) o0 + output_stride); const uint8x8_t vperm0 = vld1_u8(params->neon_tbl64.pos0); const uint8x8_t vperm1 = vld1_u8(params->neon_tbl64.pos1); do { if XNN_UNPREDICTABLE(block_width < 2) { o1 = o0; } size_t bh = block_height; for (; bh >= 2; bh -= 2) { uint8x8x2_t v; v.val[0] = vld1_u8(i0); i0 = (const uint8_t*) ((uintptr_t) i0 + tile_stride); v.val[1] = vld1_u8(i1); i1 = (const uint8_t*) ((uintptr_t) i1 + tile_stride); const uint8x8_t vres0 = vtbl2_u8(v, vperm0); const uint8x8_t vres1 = vtbl2_u8(v, vperm1); vst1_lane_u32((void*) o1, vreinterpret_u32_u8(vres1), 0); o1 = (uint8_t*) ((uintptr_t) o1 + 4); vst1_lane_u32((void*) o0, vreinterpret_u32_u8(vres0), 0); o0 = (uint8_t*) ((uintptr_t) o0 + 4); vst1_lane_u16((void*) o1, vreinterpret_u16_u8(vres1), 2); o1 = (uint8_t*) ((uintptr_t) o1 + tile_wbytes_minus_4); vst1_lane_u16((void*) o0, vreinterpret_u16_u8(vres0), 2); o0 = (uint8_t*) ((uintptr_t) o0 + tile_wbytes_minus_4); } if (bh != 0) { if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } uint8x8_t v = vld1_u8(i0); const uint8x8_t vres0 = vtbl1_u8(v, vperm0); const uint8x8_t vres1 = vtbl1_u8(v, vperm1); if (bh & 1) { vst1_lane_u16((void*) o1, vreinterpret_u16_u8(vres1), 0); o1 += 2; vst1_lane_u16((void*) o0, vreinterpret_u16_u8(vres0), 0); o0 += 2; vst1_lane_u8(o1, vres1, 2); o1 += 1; vst1_lane_u8(o0, vres0, 2); o0 += 1; } } i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset); i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); o0 = (uint8_t*) ((uintptr_t) o0 + output_reset); o1 = (uint8_t*) ((uintptr_t) o1 + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void 
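// The two x32 packing kernels below produce the GOI ("groups, output
// channels, input channels") layout consumed by the f32/x32 GEMM
// microkernels: for every tile of nr output channels, nr bias slots are
// written first, then the weights follow in k-major order so that each k
// step contributes nr contiguous values, and each tile is followed by
// extra_bytes of padding. A minimal scalar sketch of the nr == 2 tile
// layout (illustrative only -- `pw`, `w`, and `b` are hypothetical names,
// and the extra_bytes padding is elided):
//
//   uint32_t* pw = packed;                // packed output cursor
//   for (size_t i = 0; i < 2; i++) {
//     *pw++ = (b != NULL) ? b[i] : 0;     // 2 bias slots per tile
//   }
//   for (size_t k = 0; k < kc; k++) {
//     for (size_t i = 0; i < 2; i++) {
//       *pw++ = w[i * kc + k];            // channel-minor, k-major
//     }
//   }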
xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_x2_prfm(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint32_t* weights,
  const uint32_t* bias,
  uint32_t* packed_weights,
  size_t extra_bytes,
  const void* params)
{
  assert(g != 0);
  assert(nc != 0);
  assert(kc != 0);
  assert(nr == 2);
  assert(kr == 1);
  assert(sr == 1);
  assert(weights != NULL);
  assert(packed_weights != NULL);

  uint32x2x2_t v00;
  v00.val[0] = vdup_n_u32(0);
  v00.val[1] = vdup_n_u32(0);
  do {
    // NC main loop multiple of 2
    const uint32_t* w0 = weights;
    size_t n = nc;
    for (; n >= 2; n -= 2) {
      if XNN_LIKELY(bias != NULL) {
        uint32x2_t vb0 = vld1_u32(bias); bias += 2;
        vst1_u32(packed_weights, vb0); packed_weights += 2;
      } else {
        const uint32x2_t vzero = vmov_n_u32(0);
        vst1_u32(packed_weights, vzero); packed_weights += 2;
      }
      const uint32_t* w1 = w0 + kc;
      xnn_prefetch_to_l1((const int8_t*) w0);
      xnn_prefetch_to_l1((const int8_t*) w0 + 64);
      xnn_prefetch_to_l1((const int8_t*) w1);
      xnn_prefetch_to_l1((const int8_t*) w1 + 64);

      // KC main loop multiple of 2
      size_t k = kc;
      for (; k >= 2; k -= 2) {
        v00 = vld2_lane_u32(w0, v00, 0); w0 += 2;
        v00 = vld2_lane_u32(w1, v00, 1); w1 += 2;
        xnn_prefetch_to_l1((const int8_t*) w0 + 128);
        xnn_prefetch_to_l1((const int8_t*) w1 + 128);
        vst1_u32(packed_weights + 0, v00.val[0]);
        vst1_u32(packed_weights + 2, v00.val[1]);
        packed_weights += 4;
      }

      // KC remainder
      for (; k != 0; --k) {
        v00.val[0] = vld1_lane_u32(w0, v00.val[0], 0); w0 += 1;
        v00.val[0] = vld1_lane_u32(w1, v00.val[0], 1); w1 += 1;
        vst1_u32(packed_weights + 0, v00.val[0]);
        packed_weights += 2;
      }
      packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes);
      w0 = w1;
    }

    if XNN_UNLIKELY(n != 0) {
      // NC remainder of 1
      if XNN_LIKELY(bias != NULL) {
        *packed_weights = *bias++;
      } else {
        const uint32x2_t vzero = vmov_n_u32(0);
        vst1_u32(packed_weights + 0, vzero);
      }
      packed_weights += 2;
      size_t k = kc;
      do {
        *packed_weights = *w0++; packed_weights += 2;
      } while (--k);
      packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes);
    }
    weights += nc * kc;
  } while (--g != 0);
}

void xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_x4_prfm(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint32_t* weights,
  const uint32_t* bias,
  uint32_t* packed_weights,
  size_t extra_bytes,
  const void* params)
{
  assert(g != 0);
  assert(nc != 0);
  assert(kc != 0);
  assert(nr == 8);
  assert(kr == 1);
  assert(sr == 1);
  assert(weights != NULL);
  assert(packed_weights != NULL);

  uint32x4x4_t vtmp0123x0123;
  vtmp0123x0123.val[0] = vdupq_n_u32(0);
  vtmp0123x0123.val[1] = vdupq_n_u32(0);
  vtmp0123x0123.val[2] = vdupq_n_u32(0);
  vtmp0123x0123.val[3] = vdupq_n_u32(0);
  uint32x4x4_t vtmp0123x4567;
  vtmp0123x4567.val[0] = vdupq_n_u32(0);
  vtmp0123x4567.val[1] = vdupq_n_u32(0);
  vtmp0123x4567.val[2] = vdupq_n_u32(0);
  vtmp0123x4567.val[3] = vdupq_n_u32(0);
  do {
    // NC main loop multiple of 8
    const uint32_t* w0 = weights;
    size_t n = nc;
    for (; n >= 8; n -= 8) {
      if XNN_LIKELY(bias != NULL) {
        uint32x4_t vb0 = vld1q_u32(bias); bias += 4;
        uint32x4_t vb4 = vld1q_u32(bias); bias += 4;
        vst1q_u32(packed_weights, vb0); packed_weights += 4;
        vst1q_u32(packed_weights, vb4); packed_weights += 4;
      } else {
        const uint32x4_t vzero = vmovq_n_u32(0);
        vst1q_u32(packed_weights, vzero); packed_weights += 4;
        vst1q_u32(packed_weights, vzero); packed_weights += 4;
      }
      const uint32_t* w1 = w0 + kc;
      const uint32_t* w2 = w1 + kc;
      const uint32_t* w3 = w2 + kc;
      const uint32_t* w4 = w3 + kc;
      const uint32_t* w5 = w4 + kc;
      const uint32_t* w6 = w5 + kc;
      const uint32_t* w7 = w6 + kc;
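      // w0..w7 now point at the 8 rows (output channels) of this tile, each
      // kc elements long; the prefetches that follow warm the first 128
      // bytes of every row before the strided lane loads begin.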
xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vtmp0123x4567 = vld4q_lane_u32(w7, vtmp0123x4567, 3); w7 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); w0 += 1; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); w1 += 1; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); w2 += 1; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); w3 += 1; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); w4 += 1; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); w5 += 1; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); w6 += 1; vtmp0x4567 = vld1q_lane_u32(w7, vtmp0x4567, 3); w7 += 1; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); w0 += 2; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); w1 += 2; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); w2 += 2; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); w3 += 2; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); w4 += 2; vtmp01x4567 
= vld2q_lane_u32(w5, vtmp01x4567, 1); w5 += 2; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); w6 += 2; vtmp01x4567 = vld2q_lane_u32(w7, vtmp01x4567, 3); w7 += 2; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); w0 += 3; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); w1 += 3; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); w2 += 3; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); w3 += 3; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); w4 += 3; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); w5 += 3; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); w6 += 3; vtmp012x4567 = vld3q_lane_u32(w7, vtmp012x4567, 3); w7 += 3; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } // NR remainder has less than 8 rows so last row is not loaded const uint32_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint32_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint32_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint32_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint32_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint32_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; 
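// Note on store order: for each of the four k indices, channels 0-3 (vtmp0123x0123.val[k]) are emitted immediately before channels 4-7 (vtmp0123x4567.val[k]), so every k step's 8 weights stay contiguous in the packed buffer.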
vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x32_packw_gemm_goi_ukernel_x8s4__neon_ld4lane_x4_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint32_t* weights, const uint32_t* bias, uint32_t* packed_weights, size_t extra_bytes, const void* params) 
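// This s4 ("shuffle by 4") variant packs with sr == 4: after every lane load
// the four accumulator registers are rotated (val[3] -> val[0] -> val[1] ->
// val[2] -> val[3]), so the weights of output channel i land pre-rotated by
// i positions along k. The matching s4 GEMM microkernels can then advance
// through k by rotating the input vector (e.g. with vextq_f32) instead of
// broadcasting individual elements.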
{ assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); assert(kr == 1); assert(sr == 4); assert(weights != NULL); assert(packed_weights != NULL); uint32x4x4_t vtmp0123x0123; vtmp0123x0123.val[0] = vdupq_n_u32(0); vtmp0123x0123.val[1] = vdupq_n_u32(0); vtmp0123x0123.val[2] = vdupq_n_u32(0); vtmp0123x0123.val[3] = vdupq_n_u32(0); uint32x4x4_t vtmp0123x4567; vtmp0123x4567.val[0] = vdupq_n_u32(0); vtmp0123x4567.val[1] = vdupq_n_u32(0); vtmp0123x4567.val[2] = vdupq_n_u32(0); vtmp0123x4567.val[3] = vdupq_n_u32(0); do { // NC main loop multiple of 8 const uint32_t* w0 = weights; size_t n = nc; for (; n >= 8; n -= 8) { if XNN_LIKELY(bias != NULL) { uint32x4_t vb0 = vld1q_u32(bias); bias += 4; uint32x4_t vb4 = vld1q_u32(bias); bias += 4; vst1q_u32(packed_weights, vb0); packed_weights += 4; vst1q_u32(packed_weights, vb4); packed_weights += 4; } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } const uint32_t* w1 = w0 + kc; const uint32_t* w2 = w1 + kc; const uint32_t* w3 = w2 + kc; const uint32_t* w4 = w3 + kc; const uint32_t* w5 = w4 + kc; const uint32_t* w6 = w5 + kc; const uint32_t* w7 = w6 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { uint32x4_t vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = 
vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w7, vtmp0123x4567, 3); w7 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp1x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp1x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); w0 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); w1 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); w2 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); w3 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); w4 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); w5 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); w6 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w7, vtmp0x4567, 3); w7 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, 
vtmp0x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); w0 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); w1 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); w2 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); w3 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); w4 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); w5 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); w6 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w7, vtmp01x4567, 3); w7 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 
0); w0 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); w1 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); w2 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); w3 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); w4 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); w5 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); w6 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w7, vtmp012x4567, 3); w7 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } // NR remainder has less than 8 rows so last row is not loaded const uint32_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint32_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint32_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint32_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint32_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint32_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { uint32x4_t vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vsrtmp = vtmp0123x0123.val[3]; 
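// The assignments that follow rotate val[3] -> val[2] -> val[1] -> val[0] -> val[3] after every lane load: this shift-register rotation is what implements the sr == 4 shuffle of the packed layout, offsetting each row's data by (row & 3) slots.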
vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp1x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp1x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); 
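// Unlike the NC main loop, these lane loads do not advance w0..w6: this KC remainder is the final use of the row pointers in the last column group, so the increments are omitted.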
vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = 
vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void 
xnn_x32_packx_ukernel_4x__neon_st4_x4_prfm( size_t m, size_t k, const uint32_t* x, size_t x_stride, uint32_t* restrict y) { assert(m != 0); assert(m <= 4); assert(k != 0); assert(x != NULL); assert(y != NULL); const uint32_t* x0 = x; const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride); if XNN_UNPREDICTABLE(m < 2) { x1 = x0; } const uint32_t* x2 = (const uint32_t*) ((uintptr_t) x1 + x_stride); if XNN_UNPREDICTABLE(m <= 2) { x2 = x1; } const uint32_t* x3 = (const uint32_t*) ((uintptr_t) x2 + x_stride); if XNN_UNPREDICTABLE(m != 4) { x3 = x2; } for (; k >= 4; k -= 4) { uint32x4x4_t vx0123x0123; vx0123x0123.val[0] = vld1q_u32(x0); x0 += 4; vx0123x0123.val[1] = vld1q_u32(x1); x1 += 4; vx0123x0123.val[2] = vld1q_u32(x2); x2 += 4; vx0123x0123.val[3] = vld1q_u32(x3); x3 += 4; xnn_prefetch_to_l1((const int8_t*) x0 + 128); xnn_prefetch_to_l1((const int8_t*) x1 + 128); xnn_prefetch_to_l1((const int8_t*) x2 + 128); xnn_prefetch_to_l1((const int8_t*) x3 + 128); vst4q_u32(y, vx0123x0123); y += 16; } if XNN_UNLIKELY(k != 0) { uint32x4_t vt0123 = vdupq_n_u32(0); do { vt0123 = vld1q_lane_u32(x0, vt0123, 0); x0 += 1; vt0123 = vld1q_lane_u32(x1, vt0123, 1); x1 += 1; vt0123 = vld1q_lane_u32(x2, vt0123, 2); x2 += 1; vt0123 = vld1q_lane_u32(x3, vt0123, 3); x3 += 1; xnn_prefetch_to_l1((const int8_t*) x0 + 128); xnn_prefetch_to_l1((const int8_t*) x1 + 128); xnn_prefetch_to_l1((const int8_t*) x2 + 128); xnn_prefetch_to_l1((const int8_t*) x3 + 128); vst1q_u32(y, vt0123); y += 4; } while (--k != 0); } } void xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon( const uint32_t* input, uint32_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); const size_t tile_height = 4; const size_t tile_width = 4; const size_t tile_hbytes = tile_height * sizeof(uint32_t); const size_t tile_wbytes = tile_width * sizeof(uint32_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t) - tile_hbytes; const uint32_t* i0 = input; uint32_t* o = (uint32_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 3); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 4; bh -= 4) { const uint32x4_t v2_0 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_1 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_2 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_3 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4x2_t v1_0 = vzipq_u32(v2_0, v2_2); const uint32x4x2_t v1_1 = vzipq_u32(v2_1, v2_3); const uint32x4x2_t v0_0 = vzipq_u32(v1_0.val[0], v1_1.val[0]); const uint32x4x2_t v0_1 = vzipq_u32(v1_0.val[1], v1_1.val[1]); o = (uint32_t*) ((uintptr_t) o + oN_offset); vst1q_u32(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_0.val[1]); if 
XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_0.val[0]); } o = (uint32_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint32x4_t v2_0 = vld1q_u32(i0); const uint32_t *i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint32x4_t v2_1 = vld1q_u32(i1); const uint32_t *i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint32x4_t v2_2 = vld1q_u32(i2); const uint32x4_t v2_3 = vmovq_n_u32(0); const uint32x4x2_t v1_0 = vzipq_u32(v2_0, v2_2); const uint32x4x2_t v1_1 = vzipq_u32(v2_1, v2_3); const uint32x4x2_t v0_0 = vzipq_u32(v1_0.val[0], v1_1.val[0]); const uint32x4x2_t v0_1 = vzipq_u32(v1_0.val[1], v1_1.val[1]); uint32x2_t v0_low = vget_low_u32(v0_0.val[0]); uint32x2_t v1_low = vget_low_u32(v0_0.val[1]); uint32x2_t v2_low = vget_low_u32(v0_1.val[0]); uint32x2_t v3_low = vget_low_u32(v0_1.val[1]); if (bh & 2) { o = (uint32_t*) ((uintptr_t) o + oN_stride); vst1_u32(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v0_low); o += 2; v0_low = vget_high_u32(v0_0.val[0]); v1_low = vget_high_u32(v0_0.val[1]); v2_low = vget_high_u32(v0_1.val[0]); v3_low = vget_high_u32(v0_1.val[1]); } if (bh & 1) { o = (uint32_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v0_low, 0); } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); o = (uint32_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void xnn_x32_unpool_ukernel__neon( size_t kernel_elements, size_t channels, uint32_t fill, const uint32_t* input, const uint32_t* index, uint32_t** output) { // Pre-initialize outputs with constant. const uint32x4_t vfill = vdupq_n_u32(fill); uint32_t** os = output; do { uint32_t* o = *os++; size_t c = channels; for (; c >= 4; c -= 4) { vst1q_u32(o, vfill); o += 4; } if (c != 0) { if (c & 2) { vst1_u32(o, vget_low_u32(vfill)); o += 2; } if (c & 1) { vst1q_lane_u32(o, vfill, 0); } } } while (--kernel_elements != 0); // Copy indexed elements to output. 
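// Each of the kernel_elements output pages was pre-filled with `fill` above; the scatter loop below then writes the `channels` input values, with index[j] selecting the page that receives input value j at channel byte offset j * sizeof(uint32_t).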
size_t offset = 0; do { const uint32_t i = *index++; *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++; offset += sizeof(uint32_t); } while (--channels != 0); } void xnn_x32_zip_x2_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); uint32_t* o = output; while (n >= 16) { uint32x4x2_t vxy; vxy.val[0] = vld1q_u32(x); x += 4; vxy.val[1] = vld1q_u32(y); y += 4; vst2q_u32(o, vxy); o += 8; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x2_t vxy; vxy.val[0] = vld1_u32(x); x += 2; vxy.val[1] = vld1_u32(y); y += 2; vst2_u32(o, vxy); o += 4; } if (n & 4) { uint32x2_t vxy = vld1_dup_u32(x); vxy = vld1_lane_u32(y, vxy, 1); vst1_u32(o, vxy); } } } void xnn_x32_zip_x3_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); uint32_t* o = output; while (n >= 16) { uint32x4x3_t vxyz; vxyz.val[0] = vld1q_u32(x); x += 4; vxyz.val[1] = vld1q_u32(y); y += 4; vxyz.val[2] = vld1q_u32(z); z += 4; vst3q_u32(o, vxyz); o += 12; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x3_t vxyz; vxyz.val[0] = vld1_u32(x); x += 2; vxyz.val[1] = vld1_u32(y); y += 2; vxyz.val[2] = vld1_u32(z); z += 2; vst3_u32(o, vxyz); o += 6; } if (n & 4) { uint32x2_t vxy = vld1_dup_u32(x); const uint32x2_t vz = vld1_dup_u32(z); vxy = vld1_lane_u32(y, vxy, 1); vst1_u32(o, vxy); o += 2; vst1_lane_u32(o, vz, 0); } } } void xnn_x32_zip_x4_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n); uint32_t* o = output; while (n >= 16) { uint32x4x4_t vxyzw; vxyzw.val[0] = vld1q_u32(x); x += 4; vxyzw.val[1] = vld1q_u32(y); y += 4; vxyzw.val[2] = vld1q_u32(z); z += 4; vxyzw.val[3] = vld1q_u32(w); w += 4; vst4q_u32(o, vxyzw); o += 16; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x4_t vxyzw; vxyzw.val[0] = vld1_u32(x); x += 2; vxyzw.val[1] = vld1_u32(y); y += 2; vxyzw.val[2] = vld1_u32(z); z += 2; vxyzw.val[3] = vld1_u32(w); w += 2; vst4_u32(o, vxyzw); o += 8; } if (n & 4) { uint32x4_t vxyzw = vld1q_dup_u32(x); vxyzw = vld1q_lane_u32(y, vxyzw, 1); vxyzw = vld1q_lane_u32(z, vxyzw, 2); vxyzw = vld1q_lane_u32(w, vxyzw, 3); vst1q_u32(o, vxyzw); } } } void xnn_x32_zip_xm_ukernel__neon( size_t n, size_t m, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); assert(m >= 4); const uint32_t* w = input; const size_t group_increment = m * 4; const size_t input_increment = n * 3; const size_t output_increment = 16 - m * n; const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1)); uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16)); for (size_t i = 0; i < m; i += 4) { w = (const uint32_t*) ((uintptr_t) w + input_increment); if (w >= last_input) { w = last_input; } const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n); const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n); const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n); size_t k = n; while (k >= 16) { const uint32x4_t vx = vld1q_u32(x); x += 4; const uint32x4_t vy = vld1q_u32(y); y += 4; const uint32x4_t vz = 
vld1q_u32(z); z += 4; const uint32x4_t vw = vld1q_u32(w); w += 4; const uint32x4x2_t vxy = vzipq_u32(vx, vy); const uint32x4x2_t vzw = vzipq_u32(vz, vw); vst1_u32(output, vget_low_u32(vxy.val[0])); vst1_u32(output + 2, vget_low_u32(vzw.val[0])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_high_u32(vxy.val[0])); vst1_u32(output + 2, vget_high_u32(vzw.val[0])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_low_u32(vxy.val[1])); vst1_u32(output + 2, vget_low_u32(vzw.val[1])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_high_u32(vxy.val[1])); vst1_u32(output + 2, vget_high_u32(vzw.val[1])); output = (uint32_t*) ((uintptr_t) output + group_increment); k -= 16; } if XNN_UNLIKELY(k != 0) { if (k & 8) { const uint32x2_t vx = vld1_u32(x); x += 2; const uint32x2_t vy = vld1_u32(y); y += 2; const uint32x2_t vz = vld1_u32(z); z += 2; const uint32x2_t vw = vld1_u32(w); w += 2; const uint32x2x2_t vxy = vzip_u32(vx, vy); const uint32x2x2_t vzw = vzip_u32(vz, vw); vst1_u32(output, vxy.val[0]); vst1_u32(output + 2, vzw.val[0]); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vxy.val[1]); vst1_u32(output + 2, vzw.val[1]); output = (uint32_t*) ((uintptr_t) output + group_increment); } if (k & 4) { const uint32x2_t vx = vld1_dup_u32(x); const uint32x2_t vz = vld1_dup_u32(z); const uint32x2_t vxy = vld1_lane_u32(y, vx, 1); const uint32x2_t vzw = vld1_lane_u32(w, vz, 1); w += 1; vst1_u32(output, vxy); vst1_u32(output + 2, vzw); output = (uint32_t*) ((uintptr_t) output + group_increment); } } output = (uint32_t*) ((uintptr_t) output + output_increment); if (output > last_output) { output = last_output; } } } void xnn_x8_packw_gemm_goi_ukernel_x8__scalar_int_x4( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const int8_t* weights, const uint32_t* bias, int8_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); // This kernel is for NR=8 assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); int8_t* out = (int8_t*) packed_weights; const uint32_t* b = (const uint32_t*) bias; do { // NC main loop multiple of 8 const int8_t* w0 = (const int8_t*) weights; size_t n = nc; for (;n >= 8; n -= 8) { if XNN_LIKELY(b != NULL) { ((uint32_t*) out)[0] = b[0]; ((uint32_t*) out)[1] = b[1]; ((uint32_t*) out)[2] = b[2]; ((uint32_t*) out)[3] = b[3]; ((uint32_t*) out)[4] = b[4]; ((uint32_t*) out)[5] = b[5]; ((uint32_t*) out)[6] = b[6]; ((uint32_t*) out)[7] = b[7]; b += 8; } else { ((uint32_t*) out)[0] = 0; ((uint32_t*) out)[1] = 0; ((uint32_t*) out)[2] = 0; ((uint32_t*) out)[3] = 0; ((uint32_t*) out)[4] = 0; ((uint32_t*) out)[5] = 0; ((uint32_t*) out)[6] = 0; ((uint32_t*) out)[7] = 0; } out += 8 * sizeof(uint32_t); const int8_t* w1 = w0 + kc; const int8_t* w2 = w1 + kc; const int8_t* w3 = w2 + kc; const int8_t* w4 = w3 + kc; const int8_t* w5 = w4 + kc; const int8_t* w6 = w5 + kc; const int8_t* w7 = w6 + kc; // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const 
int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; const int8_t v70 = w7[0]; const int8_t v71 = w7[1]; const int8_t v72 = w7[2]; const int8_t v73 = w7[3]; w7 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[7] = v70; out[8] = v01; out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[15] = v71; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[23] = v72; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out[31] = v73; out += 32; } // KC remainder for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] = v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; const int8_t v7 = *w7++; out[7] = v7; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { if XNN_LIKELY(b != NULL) { size_t nb = n; do { *((uint32_t*) out) = *b++; out += sizeof(uint32_t); } while (--nb != 0); } else { size_t nb = n; do { *((uint32_t*) out) = 0; out += sizeof(uint32_t); } while (--nb != 0); } out += (8 - n) * sizeof(uint32_t); // NR remainder has less than 8 rows so last row is not loaded const int8_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const int8_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const int8_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const int8_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const int8_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const int8_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[8] = v01; out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out += 32; } // KC remainder of 1..3 for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] 
= v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon( const uint8_t* input, uint8_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x8_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint8_t)); assert(input_stride >= block_width * sizeof(uint8_t)); const size_t tile_height = 16; const size_t tile_width = 16; const size_t tile_hbytes = tile_height * sizeof(uint8_t); const size_t tile_wbytes = tile_width * sizeof(uint8_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes; const uint8_t* i0 = input; uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 15); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 16; bh -= 16) { const uint8x16_t v4_0 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_1 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_2 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_3 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_4 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_5 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_6 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_7 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_8 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_9 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_10 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_11 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_12 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_13 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_14 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_15 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8); const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9); const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10); const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11); const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12); const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13); const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14); const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15); const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]); const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]); const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]); const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]); const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], 
v3_6.val[0]); const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]); const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]); const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]); const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]); const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]); const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]); const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]); const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]); const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]); const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]); const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]); const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]); const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]); const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]); const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]); const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]); const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]); const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]); const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]); o = (uint8_t*) ((uintptr_t) o + oN_offset); vst1q_u8(o, v0_7.val[1]); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_7.val[0]); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_6.val[1]); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_6.val[0]); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_5.val[1]); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_5.val[0]); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_4.val[1]); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_4.val[0]); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_3.val[1]); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_3.val[0]); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_2.val[1]); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_2.val[0]); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_0.val[1]); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_0.val[0]); } o = (uint8_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint8x16_t v4_0 = vld1q_u8(i0); const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint8x16_t v4_1 = vld1q_u8(i1); const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint8x16_t v4_2 = vld1q_u8(i2); const uint8_t *i3 = (const uint8_t*) 
((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const uint8x16_t v4_3 = vld1q_u8(i3); const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const uint8x16_t v4_4 = vld1q_u8(i4); const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const uint8x16_t v4_5 = vld1q_u8(i5); const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const uint8x16_t v4_6 = vld1q_u8(i6); const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride); if XNN_UNPREDICTABLE(bh < 8) { i7 = i6; } const uint8x16_t v4_7 = vld1q_u8(i7); const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride); if XNN_UNPREDICTABLE(bh <= 8) { i8 = i7; } const uint8x16_t v4_8 = vld1q_u8(i8); const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride); if XNN_UNPREDICTABLE(bh < 10) { i9 = i8; } const uint8x16_t v4_9 = vld1q_u8(i9); const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride); if XNN_UNPREDICTABLE(bh <= 10) { i10 = i9; } const uint8x16_t v4_10 = vld1q_u8(i10); const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride); if XNN_UNPREDICTABLE(bh < 12) { i11 = i10; } const uint8x16_t v4_11 = vld1q_u8(i11); const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride); if XNN_UNPREDICTABLE(bh <= 12) { i12 = i11; } const uint8x16_t v4_12 = vld1q_u8(i12); const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride); if XNN_UNPREDICTABLE(bh < 14) { i13 = i12; } const uint8x16_t v4_13 = vld1q_u8(i13); const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride); if XNN_UNPREDICTABLE(bh <= 14) { i14 = i13; } const uint8x16_t v4_14 = vld1q_u8(i14); const uint8x16_t v4_15 = vmovq_n_u8(0); const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8); const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9); const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10); const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11); const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12); const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13); const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14); const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15); const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]); const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]); const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]); const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]); const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]); const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]); const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]); const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]); const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]); const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]); const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]); const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]); const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]); const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]); const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]); const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]); const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]); const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]); const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]); const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]); const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]); const uint8x16x2_t 
v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]); const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]); const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]); uint8x8_t v0_low = vget_low_u8(v0_0.val[0]); uint8x8_t v1_low = vget_low_u8(v0_0.val[1]); uint8x8_t v2_low = vget_low_u8(v0_1.val[0]); uint8x8_t v3_low = vget_low_u8(v0_1.val[1]); uint8x8_t v4_low = vget_low_u8(v0_2.val[0]); uint8x8_t v5_low = vget_low_u8(v0_2.val[1]); uint8x8_t v6_low = vget_low_u8(v0_3.val[0]); uint8x8_t v7_low = vget_low_u8(v0_3.val[1]); uint8x8_t v8_low = vget_low_u8(v0_4.val[0]); uint8x8_t v9_low = vget_low_u8(v0_4.val[1]); uint8x8_t v10_low = vget_low_u8(v0_5.val[0]); uint8x8_t v11_low = vget_low_u8(v0_5.val[1]); uint8x8_t v12_low = vget_low_u8(v0_6.val[0]); uint8x8_t v13_low = vget_low_u8(v0_6.val[1]); uint8x8_t v14_low = vget_low_u8(v0_7.val[0]); uint8x8_t v15_low = vget_low_u8(v0_7.val[1]); if (bh & 8) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_u8(o, v15_low); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v14_low); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v13_low); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v12_low); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v11_low); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v10_low); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v9_low); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v8_low); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v7_low); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v6_low); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v5_low); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v4_low); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v0_low); o += 8; v0_low = vget_high_u8(v0_0.val[0]); v1_low = vget_high_u8(v0_0.val[1]); v2_low = vget_high_u8(v0_1.val[0]); v3_low = vget_high_u8(v0_1.val[1]); v4_low = vget_high_u8(v0_2.val[0]); v5_low = vget_high_u8(v0_2.val[1]); v6_low = vget_high_u8(v0_3.val[0]); v7_low = vget_high_u8(v0_3.val[1]); v8_low = vget_high_u8(v0_4.val[0]); v9_low = vget_high_u8(v0_4.val[1]); v10_low = vget_high_u8(v0_5.val[0]); v11_low = vget_high_u8(v0_5.val[1]); v12_low = vget_high_u8(v0_6.val[0]); v13_low = vget_high_u8(v0_6.val[1]); v14_low = vget_high_u8(v0_7.val[0]); v15_low = vget_high_u8(v0_7.val[1]); } if (bh & 4) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32((void*) o, vreinterpret_u32_u8(v15_low), 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } 
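// These tail stores walk the output columns from 15 down to 0; the alternating `block_width > c` / `block_width >= c` guards only step o back while that column exists, so narrower blocks keep overwriting the clamped column and the final store always lands on column 0.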
vst1_lane_u32((void*) o, vreinterpret_u32_u8(v14_low), 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v13_low), 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v12_low), 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v11_low), 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v10_low), 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v9_low), 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v8_low), 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v0_low), 0); o += 4; v0_low = vext_u8(v0_low, v0_low, 4); v1_low = vext_u8(v1_low, v1_low, 4); v2_low = vext_u8(v2_low, v2_low, 4); v3_low = vext_u8(v3_low, v3_low, 4); v4_low = vext_u8(v4_low, v4_low, 4); v5_low = vext_u8(v5_low, v5_low, 4); v6_low = vext_u8(v6_low, v6_low, 4); v7_low = vext_u8(v7_low, v7_low, 4); v8_low = vext_u8(v8_low, v8_low, 4); v9_low = vext_u8(v9_low, v9_low, 4); v10_low = vext_u8(v10_low, v10_low, 4); v11_low = vext_u8(v11_low, v11_low, 4); v12_low = vext_u8(v12_low, v12_low, 4); v13_low = vext_u8(v13_low, v13_low, 4); v14_low = vext_u8(v14_low, v14_low, 4); v15_low = vext_u8(v15_low, v15_low, 4); } if (bh & 2) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u16((void*) o, vreinterpret_u16_u8(v15_low), 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v14_low), 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v13_low), 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v12_low), 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } 
vst1_lane_u16((void*) o, vreinterpret_u16_u8(v11_low), 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v10_low), 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v9_low), 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v8_low), 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v0_low), 0); o += 2; v0_low = vext_u8(v0_low, v0_low, 2); v1_low = vext_u8(v1_low, v1_low, 2); v2_low = vext_u8(v2_low, v2_low, 2); v3_low = vext_u8(v3_low, v3_low, 2); v4_low = vext_u8(v4_low, v4_low, 2); v5_low = vext_u8(v5_low, v5_low, 2); v6_low = vext_u8(v6_low, v6_low, 2); v7_low = vext_u8(v7_low, v7_low, 2); v8_low = vext_u8(v8_low, v8_low, 2); v9_low = vext_u8(v9_low, v9_low, 2); v10_low = vext_u8(v10_low, v10_low, 2); v11_low = vext_u8(v11_low, v11_low, 2); v12_low = vext_u8(v12_low, v12_low, 2); v13_low = vext_u8(v13_low, v13_low, 2); v14_low = vext_u8(v14_low, v14_low, 2); v15_low = vext_u8(v15_low, v15_low, 2); } if (bh & 1) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u8(o, v15_low, 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v14_low, 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v13_low, 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v12_low, 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v11_low, 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v10_low, 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v9_low, 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v8_low, 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v7_low, 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + 
minus_output_stride); } vst1_lane_u8(o, v6_low, 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v5_low, 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v4_low, 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v0_low, 0); } }
// Advance the input and output pointers to the next block of columns.
i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset); o = (uint8_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); }

// Interleave two byte streams (x at input, y at input + n) into xy pairs.
void xnn_x8_zip_x2_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x2_t vxy; vxy.val[0] = vld1_u8(x); x += 8; vxy.val[1] = vld1_u8(y); y += 8; vst2_u8(o, vxy); o += 16; n -= 8; } while (n >= 8); if (n != 0) { const size_t address_increment = n - 8; uint8x8x2_t vxy; vxy.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxy.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vst2_u8((uint8_t*) ((uintptr_t) o + address_increment * 2), vxy); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; o[0] = vx; o[1] = vy; o += 2; } while (--n != 0); } }

// Interleave three byte streams into xyz triples.
void xnn_x8_zip_x3_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x3_t vxyz; vxyz.val[0] = vld1_u8(x); x += 8; vxyz.val[1] = vld1_u8(y); y += 8; vxyz.val[2] = vld1_u8(z); z += 8; vst3_u8(o, vxyz); o += 24; n -= 8; } while (n >= 8); if (n != 0) { const size_t address_increment = n - 8; uint8x8x3_t vxyz; vxyz.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxyz.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vxyz.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); vst3_u8((uint8_t*) ((uintptr_t) o + address_increment * 3), vxyz); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; const uint8_t vz = *z++; o[0] = vx; o[1] = vy; o[2] = vz; o += 3; } while (--n != 0); } }

// Interleave four byte streams into xyzw quads.
void xnn_x8_zip_x4_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x4_t vxyzw; vxyzw.val[0] = vld1_u8(x); x += 8; vxyzw.val[1] = vld1_u8(y); y += 8; vxyzw.val[2] = vld1_u8(z); z += 8; vxyzw.val[3] = vld1_u8(w); w += 8; vst4_u8(o, vxyzw); o += 32; n -= 8; } while (n >= 8); if (n != 0) {
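/* Tail of 1-7 groups: step the pointers back by (8 - n) bytes and redo the
 * last full 8-byte chunk; the re-written span of the output was already
 * produced with identical values by the previous iteration. */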
const size_t address_increment = n - 8; uint8x8x4_t vxyzw; vxyzw.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxyzw.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vxyzw.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); vxyzw.val[3] = vld1_u8((const uint8_t*) ((uintptr_t) w + address_increment)); vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; const uint8_t vz = *z++; const uint8_t vw = *w++; o[0] = vx; o[1] = vy; o[2] = vz; o[3] = vw; o += 4; } while (--n != 0); } }

// Interleave m byte streams of n bytes each. Streams are processed four at a
// time; when m is not a multiple of 4, the last group re-reads the final
// streams and clamps the output pointer so all writes stay in bounds.
void xnn_x8_zip_xm_ukernel__neon( size_t n, size_t m, const uint8_t* input, uint8_t* output) { const uint8_t* w = input; const size_t input_increment = n * 3; const size_t output_increment = 4 - m * n; const uint8_t* last_input = w + n * (m - 1); uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4)); if (n >= 8) { for (size_t i = 0; i < m; i += 4) { size_t k = n; w = (const uint8_t*) ((uintptr_t) w + input_increment); if (w >= last_input) { w = last_input; } const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n); const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n); const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n); while (k >= 8) { const uint8x8_t vx = vld1_u8(x); x += 8; const uint8x8_t vy = vld1_u8(y); y += 8; const uint8x8_t vz = vld1_u8(z); z += 8; const uint8x8_t vw = vld1_u8(w); w += 8; const uint8x8x2_t vxy = vzip_u8(vx, vy); const uint8x8x2_t vzw = vzip_u8(vz, vw); const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 1); output = (uint8_t*) ((uintptr_t) output + m); k -= 8; }
// The tail reload steps each pointer back by (8 - k) bytes; the negative
// shift count makes vshl_u64 shift right, discarding the bytes that were
// already interleaved and leaving the k fresh bytes in the low lanes.
if (k != 0) { const size_t address_increment = k - 8; x = (const uint8_t*) ((uintptr_t) x + address_increment); y = (const uint8_t*) ((uintptr_t) y + address_increment); z = (const uint8_t*) ((uintptr_t) z + address_increment); w = (const uint8_t*) ((uintptr_t) w + address_increment); const int64x1_t vshift = vmov_n_s64(8 * address_increment); const uint64x1_t vx = vshl_u64(vreinterpret_u64_u8(vld1_u8(x)), vshift); const uint64x1_t vy = vshl_u64(vreinterpret_u64_u8(vld1_u8(y)), vshift); const uint64x1_t vz = vshl_u64(vreinterpret_u64_u8(vld1_u8(z)), vshift); const uint64x1_t vw = vshl_u64(vreinterpret_u64_u8(vld1_u8(w)), vshift); w += 8; const uint8x8x2_t vxy = vzip_u8(vreinterpret_u8_u64(vx), vreinterpret_u8_u64(vy)); const uint8x8x2_t vzw = vzip_u8(vreinterpret_u8_u64(vz), vreinterpret_u8_u64(vw)); const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]); uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]); uint32x2_t vxyzw2 = vreinterpret_u32_u16(vxyzw_hi.val[0]); uint32x2_t vxyzw3 = vreinterpret_u32_u16(vxyzw_hi.val[1]);
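/* The k remaining interleaved groups are stored one 32-bit lane at a time;
 * after each k&4 / k&2 branch the unwritten halves are shifted down
 * (vxyzw0 = vxyzw2, etc.) so the next branch always starts at vxyzw0. */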
if (k & 4) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw0, 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw1, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw1, 1); output = (uint8_t*) ((uintptr_t) output + m); vxyzw0 = vxyzw2; vxyzw1 = vxyzw3; } if (k & 2) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw0, 1); output = (uint8_t*) ((uintptr_t) output + m); vxyzw0 = vxyzw1; } if (k & 1) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); } } output = (uint8_t*) ((uintptr_t) output + output_increment); if (output > last_output) { output = last_output; } } } else {
// Scalar fallback for short streams (n < 8): transpose byte by byte.
const uint8_t* i = input; uint8_t* o = output; size_t k = n; do { size_t l = m; const uint8_t* ii = i++; do { *o++ = *ii; ii += n; } while (--l != 0); } while (--k != 0); } }

// Fill `rows` rows of `channels` bytes with a 4-byte pattern replicated
// across a vector register; consecutive rows are `output_stride` bytes apart.
void xnn_xx_fill_ukernel__neon_x64( size_t rows, size_t channels, void* output, size_t output_stride, const uint32_t fill_pattern) { assert(rows != 0); assert(channels != 0); const size_t output_increment = output_stride - channels; const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern)); do { size_t c = channels; for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); } for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); } if XNN_UNLIKELY(c != 0) { if XNN_LIKELY(c & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = ((uint8_t*) output + 8); } if XNN_LIKELY(c & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = ((uint8_t*) output + 4); } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if XNN_LIKELY(c & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = ((uint8_t*) output + 2); vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if XNN_LIKELY(c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = ((uint8_t*) output + 1); } } output = (void*) ((uintptr_t) output + output_increment); } while (--rows != 0); }

// Pad each row on both sides: `pre_padding` bytes of the fill pattern,
// `channels` bytes copied from the input row, then `post_padding` bytes of
// the fill pattern. Tagged XNN_OOB_READS because the copy tail loads a full
// 16-byte vector and stores only the valid part.
void xnn_xx_pad_ukernel__neon( size_t rows, size_t channels, size_t pre_padding, size_t post_padding, const void* input, size_t input_stride, void* output, size_t output_stride, uint32_t fill_pattern) XNN_OOB_READS { const size_t input_increment = input_stride - channels; const size_t output_increment = output_stride - (pre_padding + channels + post_padding); const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern)); do {
// Pre-pad input channels.
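/* The fill pattern sits in all four 32-bit lanes of vfill_pattern, so 16-,
 * 8- and 4-byte chunks can be stored from it directly; after a 2-byte store
 * the subpattern is rotated with vext_u8 so a following 1-byte store keeps
 * the 4-byte period. */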
size_t l = pre_padding; if XNN_LIKELY(l != 0) { for (; l >= 16 * sizeof(uint8_t); l -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = (uint8_t*) output + 16; } if (l & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = (uint8_t*) output + 8; } if (l & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = (uint8_t*) output + 4; } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if (l & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = (uint8_t*) output + 2; vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if (l & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = (uint8_t*) output + 1; } }
// Copy input channels.
size_t c = channels; for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { const uint8x16_t vdata = vld1q_u8(input); input = (const uint8_t*) input + 16; vst1q_u8(output, vdata); output = (uint8_t*) output + 16; } if XNN_UNLIKELY(c != 0) { uint8x16_t vdata = vld1q_u8(input); input = (const void*) ((uintptr_t) input + c); uint8x8_t vsubdata = vget_low_u8(vdata); if (c & (8 * sizeof(uint8_t))) { vst1_u8(output, vsubdata); output = (uint8_t*) output + 8; vsubdata = vget_high_u8(vdata); } if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32(output, vreinterpret_u32_u8(vsubdata), 0); output = (uint8_t*) output + 4; vsubdata = vext_u8(vsubdata, vsubdata, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vsubdata), 0); output = (uint8_t*) output + 2; vsubdata = vext_u8(vsubdata, vsubdata, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vsubdata, 0); output = (uint8_t*) output + 1; } }
// Post-pad input channels.
size_t r = post_padding; if XNN_LIKELY(r != 0) { for (; r >= 16 * sizeof(uint8_t); r -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = (uint8_t*) output + 16; } if (r & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = (uint8_t*) output + 8; } if (r & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = (uint8_t*) output + 4; } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if (r & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = (uint8_t*) output + 2; vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if (r & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = (uint8_t*) output + 1; } }
input = (const void*) ((uintptr_t) input + input_increment); output = (void*) ((uintptr_t) output + output_increment); } while (--rows != 0); }
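/*
 * Illustrative usage sketch (an assumption, not part of the library): shows
 * the byte layout xnn_x8_zip_x4_ukernel__neon produces for one 8-byte group
 * per stream. The guard macro XNN_ZIP_EXAMPLE is hypothetical and only keeps
 * the sketch out of normal builds.
 */
#ifdef XNN_ZIP_EXAMPLE
#include <stdio.h>

int main(void) {
  // Four streams of 8 bytes stored back to back: x | y | z | w.
  uint8_t input[32];
  for (int i = 0; i < 32; i++) {
    input[i] = (uint8_t) i;
  }
  uint8_t output[32];
  xnn_x8_zip_x4_ukernel__neon(8, input, output);
  // Expected interleaving: 0, 8, 16, 24, 1, 9, 17, 25, ...
  for (int i = 0; i < 32; i++) {
    printf("%u ", output[i]);
  }
  printf("\n");
  return 0;
}
#endif  // XNN_ZIP_EXAMPLE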