// File: test/src/amalgam/gen/neonfma.c
// Uploaded by Androidonnxfork using huggingface_hub (commit 8b7c501, 223 kB).
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <arm_neon.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/gemm.h>
#include <xnnpack/ibilinear.h>
#include <xnnpack/igemm.h>
#include <xnnpack/math-stubs.h>
#include <xnnpack/math.h>
#include <xnnpack/microparams.h>
#include <xnnpack/prefetch.h>
#include <xnnpack/raddstoreexpminusmax.h>
#include <xnnpack/spmm.h>
#include <xnnpack/vmulcaddc.h>
#include <xnnpack/vunary.h>
// XNNPACK f32 dwconv (depthwise convolution) min/max microkernel: 25 taps,
// 8 channels per main-loop step, two partial accumulators per channel vector.
//
//   channels         - number of floats produced per outer iteration; must be non-zero.
//   output_width     - number of outer iterations; must be non-zero.
//   input            - table of row pointers; 25 entries are read per outer
//                      iteration, then the table pointer moves by input_stride bytes.
//   weights          - packed in 8-channel groups: 8 accumulator seed values
//                      followed by 25 x 8 tap weights (208 floats per group).
//   output           - destination; after each outer iteration it is advanced
//                      by a further output_increment bytes.
//   input_offset     - byte offset added to every row pointer that differs from `zero`.
//   zero             - sentinel row pointer that is used as-is, without the offset
//                      (presumably a zero-filled padding row; confirm with the caller).
//   params           - supplies scalar.min / scalar.max, the clamping bounds.
//
// Even-numbered taps are accumulated into one running sum (seeded from the
// weights), odd-numbered taps into a second sum (seeded by the tap-1 product);
// the two are added together just before clamping. The tail path loads whole
// 4-float vectors even when fewer than 4 channels remain, which is why the
// function carries XNN_OOB_READS.
void xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  // Fixed trip count for every tap loop below, so the optimizer may unroll them.
  enum { num_taps = 25 };

  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vclamp_hi = vld1q_dup_f32(&params->scalar.max);
  const float32x4_t vclamp_lo = vld1q_dup_f32(&params->scalar.min);

  do {
    // Resolve the 25 row pointers for this outer iteration.
    const float* rows[num_taps];
    for (size_t t = 0; t < num_taps; t++) {
      const float* row = input[t];
      assert(row != NULL);
      if XNN_UNPREDICTABLE(row != zero) {
        row = (const float*) ((uintptr_t) row + input_offset);
      }
      rows[t] = row;
    }
    input = (const float**) ((uintptr_t) input + input_stride);

    const float* wp = weights;
    size_t remaining = channels;

    // Full groups of 8 channels.
    while (remaining >= 8) {
      // Tap t weights for channels 0-3 live at kp + 8*t, channels 4-7 at kp + 8*t + 4.
      const float* kp = wp + 8;

      float32x4_t veven_lo = vld1q_f32(wp);
      float32x4_t veven_hi = vld1q_f32(wp + 4);
      veven_lo = vfmaq_f32(veven_lo, vld1q_f32(rows[0]), vld1q_f32(kp));
      veven_hi = vfmaq_f32(veven_hi, vld1q_f32(rows[0] + 4), vld1q_f32(kp + 4));

      // Tap 1 starts the second accumulator with a plain multiply.
      float32x4_t vodd_lo = vmulq_f32(vld1q_f32(rows[1]), vld1q_f32(kp + 8));
      float32x4_t vodd_hi = vmulq_f32(vld1q_f32(rows[1] + 4), vld1q_f32(kp + 12));

      rows[0] += 8;
      rows[1] += 8;

      for (size_t t = 2; t < num_taps; t++) {
        const float* kt = kp + 8 * t;
        const float32x4_t vx_lo = vld1q_f32(rows[t]);
        const float32x4_t vx_hi = vld1q_f32(rows[t] + 4);
        const float32x4_t vk_lo = vld1q_f32(kt);
        const float32x4_t vk_hi = vld1q_f32(kt + 4);
        rows[t] += 8;

        if ((t & 1) == 0) {
          veven_lo = vfmaq_f32(veven_lo, vx_lo, vk_lo);
          veven_hi = vfmaq_f32(veven_hi, vx_hi, vk_hi);
        } else {
          vodd_lo = vfmaq_f32(vodd_lo, vx_lo, vk_lo);
          vodd_hi = vfmaq_f32(vodd_hi, vx_hi, vk_hi);
        }
      }
      wp = kp + 8 * num_taps;

      // Merge the partial sums, then clamp to [min, max].
      float32x4_t vout_lo = vaddq_f32(veven_lo, vodd_lo);
      float32x4_t vout_hi = vaddq_f32(veven_hi, vodd_hi);
      vout_lo = vminq_f32(vmaxq_f32(vout_lo, vclamp_lo), vclamp_hi);
      vout_hi = vminq_f32(vmaxq_f32(vout_hi, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vout_lo);
      vst1q_f32(output + 4, vout_hi);
      output += 8;
      remaining -= 8;
    }

    // Fewer than 8 channels are left here, so at most one 4-wide step is needed.
    if (remaining >= 4) {
      // Weights keep their 8-channel stride; this step uses the first half of each tap.
      const float* kp = wp + 8;

      float32x4_t veven = vld1q_f32(wp);
      veven = vfmaq_f32(veven, vld1q_f32(rows[0]), vld1q_f32(kp));
      float32x4_t vodd = vmulq_f32(vld1q_f32(rows[1]), vld1q_f32(kp + 8));
      rows[0] += 4;
      rows[1] += 4;

      for (size_t t = 2; t < num_taps; t++) {
        const float32x4_t vx = vld1q_f32(rows[t]);
        const float32x4_t vk = vld1q_f32(kp + 8 * t);
        rows[t] += 4;

        if ((t & 1) == 0) {
          veven = vfmaq_f32(veven, vx, vk);
        } else {
          vodd = vfmaq_f32(vodd, vx, vk);
        }
      }
      // Step to the second half of the group for the tail below.
      wp += 4;

      float32x4_t vout = vaddq_f32(veven, vodd);
      vout = vminq_f32(vmaxq_f32(vout, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vout);
      output += 4;
      remaining -= 4;
    }

    // Tail of 1-3 channels: compute a full vector, store only the valid lanes.
    if XNN_UNLIKELY(remaining != 0) {
      const float* kp = wp + 8;

      float32x4_t veven = vld1q_f32(wp);
      veven = vfmaq_f32(veven, vld1q_f32(rows[0]), vld1q_f32(kp));
      float32x4_t vodd = vmulq_f32(vld1q_f32(rows[1]), vld1q_f32(kp + 8));

      for (size_t t = 2; t < num_taps; t++) {
        const float32x4_t vx = vld1q_f32(rows[t]);
        const float32x4_t vk = vld1q_f32(kp + 8 * t);

        if ((t & 1) == 0) {
          veven = vfmaq_f32(veven, vx, vk);
        } else {
          vodd = vfmaq_f32(vodd, vx, vk);
        }
      }

      float32x4_t vout = vaddq_f32(veven, vodd);
      vout = vminq_f32(vmaxq_f32(vout, vclamp_lo), vclamp_hi);

      float32x2_t vpair = vget_low_f32(vout);
      if (remaining & 2) {
        vst1_f32(output, vpair);
        output += 2;
        vpair = vget_high_f32(vout);
      }
      if (remaining & 1) {
        vst1_lane_f32(output, vpair, 0);
        output += 1;
      }
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
// XNNPACK f32 dwconv (depthwise convolution) min/max microkernel: 3 taps,
// 8 channels per main-loop step, single accumulator per channel vector.
//
//   channels         - number of floats produced per outer iteration; must be non-zero.
//   output_width     - number of outer iterations; must be non-zero.
//   input            - table of row pointers; 3 entries are read per outer
//                      iteration, then the table pointer moves by input_stride bytes.
//   weights          - packed in 8-channel groups: 8 accumulator seed values
//                      followed by 3 x 8 tap weights (32 floats per group).
//   output           - destination; after each outer iteration it is advanced
//                      by a further output_increment bytes.
//   input_offset     - byte offset added to every row pointer that differs from `zero`.
//   zero             - sentinel row pointer that is used as-is, without the offset
//                      (presumably a zero-filled padding row; confirm with the caller).
//   params           - supplies scalar.min / scalar.max, the clamping bounds.
//
// The tail path loads whole 4-float vectors even when fewer than 4 channels
// remain, which is why the function carries XNN_OOB_READS.
void xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vclamp_hi = vld1q_dup_f32(&params->scalar.max);
  const float32x4_t vclamp_lo = vld1q_dup_f32(&params->scalar.min);

  do {
    // Resolve the three row pointers for this outer iteration.
    const float* row0 = input[0];
    assert(row0 != NULL);
    if XNN_UNPREDICTABLE(row0 != zero) {
      row0 = (const float*) ((uintptr_t) row0 + input_offset);
    }
    const float* row1 = input[1];
    assert(row1 != NULL);
    if XNN_UNPREDICTABLE(row1 != zero) {
      row1 = (const float*) ((uintptr_t) row1 + input_offset);
    }
    const float* row2 = input[2];
    assert(row2 != NULL);
    if XNN_UNPREDICTABLE(row2 != zero) {
      row2 = (const float*) ((uintptr_t) row2 + input_offset);
    }
    input = (const float**) ((uintptr_t) input + input_stride);

    const float* wp = weights;
    size_t remaining = channels;

    // Full groups of 8 channels: seeds at wp[0..7], tap t weights at wp[8 + 8*t ..].
    while (remaining >= 8) {
      float32x4_t vsum_lo = vld1q_f32(wp);
      float32x4_t vsum_hi = vld1q_f32(wp + 4);

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row0 + 4), vld1q_f32(wp + 12));

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row1 + 4), vld1q_f32(wp + 20));

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row2), vld1q_f32(wp + 24));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row2 + 4), vld1q_f32(wp + 28));

      row0 += 8;
      row1 += 8;
      row2 += 8;
      wp += 32;

      // Clamp to [min, max].
      vsum_lo = vminq_f32(vmaxq_f32(vsum_lo, vclamp_lo), vclamp_hi);
      vsum_hi = vminq_f32(vmaxq_f32(vsum_hi, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vsum_lo);
      vst1q_f32(output + 4, vsum_hi);
      output += 8;
      remaining -= 8;
    }

    // Fewer than 8 channels are left here, so at most one 4-wide step is needed.
    if (remaining >= 4) {
      // Weights keep their 8-channel stride; this step uses the first half of each tap.
      float32x4_t vsum = vld1q_f32(wp);
      vsum = vfmaq_f32(vsum, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum = vfmaq_f32(vsum, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum = vfmaq_f32(vsum, vld1q_f32(row2), vld1q_f32(wp + 24));

      row0 += 4;
      row1 += 4;
      row2 += 4;
      // Step to the second half of the group for the tail below.
      wp += 4;

      vsum = vminq_f32(vmaxq_f32(vsum, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vsum);
      output += 4;
      remaining -= 4;
    }

    // Tail of 1-3 channels: compute a full vector, store only the valid lanes.
    if XNN_UNLIKELY(remaining != 0) {
      float32x4_t vsum = vld1q_f32(wp);
      vsum = vfmaq_f32(vsum, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum = vfmaq_f32(vsum, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum = vfmaq_f32(vsum, vld1q_f32(row2), vld1q_f32(wp + 24));

      vsum = vminq_f32(vmaxq_f32(vsum, vclamp_lo), vclamp_hi);

      float32x2_t vpair = vget_low_f32(vsum);
      if (remaining & 2) {
        vst1_f32(output, vpair);
        output += 2;
        vpair = vget_high_f32(vsum);
      }
      if (remaining & 1) {
        vst1_lane_f32(output, vpair, 0);
        output += 1;
      }
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
// XNNPACK f32 dwconv (depthwise convolution) min/max microkernel: 4 taps,
// 8 channels per main-loop step, single accumulator per channel vector.
//
//   channels         - number of floats produced per outer iteration; must be non-zero.
//   output_width     - number of outer iterations; must be non-zero.
//   input            - table of row pointers; 4 entries are read per outer
//                      iteration, then the table pointer moves by input_stride bytes.
//   weights          - packed in 8-channel groups: 8 accumulator seed values
//                      followed by 4 x 8 tap weights (40 floats per group).
//   output           - destination; after each outer iteration it is advanced
//                      by a further output_increment bytes.
//   input_offset     - byte offset added to every row pointer that differs from `zero`.
//   zero             - sentinel row pointer that is used as-is, without the offset
//                      (presumably a zero-filled padding row; confirm with the caller).
//   params           - supplies scalar.min / scalar.max, the clamping bounds.
//
// The tail path loads whole 4-float vectors even when fewer than 4 channels
// remain, which is why the function carries XNN_OOB_READS.
void xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vclamp_hi = vld1q_dup_f32(&params->scalar.max);
  const float32x4_t vclamp_lo = vld1q_dup_f32(&params->scalar.min);

  do {
    // Resolve the four row pointers for this outer iteration.
    const float* row0 = input[0];
    assert(row0 != NULL);
    if XNN_UNPREDICTABLE(row0 != zero) {
      row0 = (const float*) ((uintptr_t) row0 + input_offset);
    }
    const float* row1 = input[1];
    assert(row1 != NULL);
    if XNN_UNPREDICTABLE(row1 != zero) {
      row1 = (const float*) ((uintptr_t) row1 + input_offset);
    }
    const float* row2 = input[2];
    assert(row2 != NULL);
    if XNN_UNPREDICTABLE(row2 != zero) {
      row2 = (const float*) ((uintptr_t) row2 + input_offset);
    }
    const float* row3 = input[3];
    assert(row3 != NULL);
    if XNN_UNPREDICTABLE(row3 != zero) {
      row3 = (const float*) ((uintptr_t) row3 + input_offset);
    }
    input = (const float**) ((uintptr_t) input + input_stride);

    const float* wp = weights;
    size_t remaining = channels;

    // Full groups of 8 channels: seeds at wp[0..7], tap t weights at wp[8 + 8*t ..].
    while (remaining >= 8) {
      float32x4_t vsum_lo = vld1q_f32(wp);
      float32x4_t vsum_hi = vld1q_f32(wp + 4);

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row0 + 4), vld1q_f32(wp + 12));

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row1 + 4), vld1q_f32(wp + 20));

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row2), vld1q_f32(wp + 24));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row2 + 4), vld1q_f32(wp + 28));

      vsum_lo = vfmaq_f32(vsum_lo, vld1q_f32(row3), vld1q_f32(wp + 32));
      vsum_hi = vfmaq_f32(vsum_hi, vld1q_f32(row3 + 4), vld1q_f32(wp + 36));

      row0 += 8;
      row1 += 8;
      row2 += 8;
      row3 += 8;
      wp += 40;

      // Clamp to [min, max].
      vsum_lo = vminq_f32(vmaxq_f32(vsum_lo, vclamp_lo), vclamp_hi);
      vsum_hi = vminq_f32(vmaxq_f32(vsum_hi, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vsum_lo);
      vst1q_f32(output + 4, vsum_hi);
      output += 8;
      remaining -= 8;
    }

    // Fewer than 8 channels are left here, so at most one 4-wide step is needed.
    if (remaining >= 4) {
      // Weights keep their 8-channel stride; this step uses the first half of each tap.
      float32x4_t vsum = vld1q_f32(wp);
      vsum = vfmaq_f32(vsum, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum = vfmaq_f32(vsum, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum = vfmaq_f32(vsum, vld1q_f32(row2), vld1q_f32(wp + 24));
      vsum = vfmaq_f32(vsum, vld1q_f32(row3), vld1q_f32(wp + 32));

      row0 += 4;
      row1 += 4;
      row2 += 4;
      row3 += 4;
      // Step to the second half of the group for the tail below.
      wp += 4;

      vsum = vminq_f32(vmaxq_f32(vsum, vclamp_lo), vclamp_hi);

      vst1q_f32(output, vsum);
      output += 4;
      remaining -= 4;
    }

    // Tail of 1-3 channels: compute a full vector, store only the valid lanes.
    if XNN_UNLIKELY(remaining != 0) {
      float32x4_t vsum = vld1q_f32(wp);
      vsum = vfmaq_f32(vsum, vld1q_f32(row0), vld1q_f32(wp + 8));
      vsum = vfmaq_f32(vsum, vld1q_f32(row1), vld1q_f32(wp + 16));
      vsum = vfmaq_f32(vsum, vld1q_f32(row2), vld1q_f32(wp + 24));
      vsum = vfmaq_f32(vsum, vld1q_f32(row3), vld1q_f32(wp + 32));

      vsum = vminq_f32(vmaxq_f32(vsum, vclamp_lo), vclamp_hi);

      float32x2_t vpair = vget_low_f32(vsum);
      if (remaining & 2) {
        vst1_f32(output, vpair);
        output += 2;
        vpair = vget_high_f32(vsum);
      }
      if (remaining & 1) {
        vst1_lane_f32(output, vpair, 0);
        output += 1;
      }
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
// f32 depthwise-convolution microkernel with min/max clamping, written with
// NEON FMA intrinsics. Each output pixel is produced in several passes, each
// consuming 5 input-row pointers:
//   - first pass:  accumulators start from values read from `weights`, 5 taps
//                  are added and the partial sums are stored to `buffer`;
//   - middle pass: (repeated while more than 5 taps remain) partial sums are
//                  read back from `buffer`, 5 more taps are added and the
//                  sums are stored to `buffer` again;
//   - last pass:   partial sums are read from `buffer`, 5 more taps are
//                  added, the result is clamped to
//                  [params->scalar.min, params->scalar.max] and written to
//                  `output`.
// Within a pass every 4-lane vector uses two accumulators: p0 receives taps
// 0, 2 and 4, p1 receives taps 1 and 3, and they are summed at the end of the
// pass.
//
// channels         - channel count per pixel; asserted non-zero.
// output_width     - number of output pixels to produce; asserted non-zero.
// input            - array of row pointers. The first pass and each middle
//                    pass read input[0..4] and advance `input` by 5; the last
//                    pass reads input[0..4] and `input` is then advanced by
//                    input_stride bytes.
// weights          - packed weights, read strictly sequentially through `w`,
//                    which is reset to `weights` for every output pixel.
// output           - destination; `channels` floats are written per pixel,
//                    then the pointer is advanced by output_increment bytes.
// input_stride     - byte step applied to `input` after the last pass.
// output_increment - byte step applied to `output` after each pixel.
// input_offset     - byte offset added to every row pointer that is not
//                    equal to `zero`.
// zero             - row pointers equal to this value are used as-is, without
//                    input_offset (presumably a shared zero-padding row;
//                    confirm against the caller).
// kernel_size      - total number of taps; asserted > 5. It only controls how
//                    many middle passes run.
// buffer           - scratch storage for partial sums between passes; the
//                    first pass fills it in 4-float steps for
//                    round_up_po2(channels, 4) channels (assuming that helper
//                    rounds up to a multiple of 4 -- it is defined elsewhere).
// params           - supplies scalar.min and scalar.max, each broadcast to
//                    all 4 lanes.
//
// NOTE(review): XNN_OOB_READS presumably advertises out-of-bounds reads; the
// remainder path of the last pass does load full 4-float vectors from the
// inputs, weights and buffer even when fewer than 4 channels remain.
void xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
size_t kernel_size,
float* buffer,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
assert(kernel_size > 5);
// Clamp bounds, broadcast once and reused for every output pixel.
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
// One iteration of this loop produces one output pixel.
do {
// The weight stream is consumed sequentially across all three passes.
const float* w = weights;
// First pass to process 5 inputs.
{
float* b = buffer;
// Resolve the 5 row pointers; rows equal to `zero` skip the byte offset.
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
input += 5;
// Process c channels and write to buffer.
// The channel count is rounded up, so this pass only ever works on whole
// 4-float vectors (no partial stores are needed when writing to `buffer`).
size_t c = round_up_po2(channels, 4);
for (; c >= 8; c -= 8) {
// Initial accumulator values come from the weight stream
// (presumably the bias -- confirm against the weight packing code).
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
// Tap 1 starts the second accumulator with a plain multiply.
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);
// Partial sums go to the scratch buffer, not to the output.
vst1q_f32(b, vacc0123p0); b += 4;
vst1q_f32(b, vacc4567p0); b += 4;
}
// Leftover channels after the 8-wide loop are handled as one 4-float
// vector; here the weight stream advances by 4 floats per load.
if (c != 0) {
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
vst1q_f32(b, vacc0123p0); b += 4;
}
}
// Middle pass to process 5 inputs in each iteration.
// `ks` counts the taps left after the first pass; the loop stops once 5 or
// fewer remain, leaving them to the last pass.
for (size_t ks = kernel_size - 5; ks > 5; ks -= 5) {
float* b = buffer;
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
input += 5;
size_t c = round_up_po2(channels, 4);
for (; c >= 8; c -= 8) {
// Resume from the partial sums of the previous pass. `b` is not advanced
// here because the updated sums are stored back to the same slots below.
float32x4_t vacc0123p0 = vld1q_f32(b);
float32x4_t vacc4567p0 = vld1q_f32(b + 4);
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);
vst1q_f32(b, vacc0123p0); b += 4;
vst1q_f32(b, vacc4567p0); b += 4;
}
// Leftover 4-float vector, updated in place in the buffer.
if (c != 0) {
float32x4_t vacc0123p0 = vld1q_f32(b);
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
vst1q_f32(b, vacc0123p0); b += 4;
}
}
// Last pass to process up to 5 inputs.
// All 5 row pointers and 5 weight vectors are always read here, whatever
// the number of taps actually left (presumably the caller pads unused
// taps -- TODO confirm against the packing/indirection setup).
{
float* b = buffer;
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
// Unlike the earlier passes, the exact channel count is used here because
// this pass writes to `output` and must not store past `channels` floats.
size_t c = channels;
for (; c >= 8; c -= 8) {
float32x4_t vacc0123p0 = vld1q_f32(b); b += 4;
float32x4_t vacc4567p0 = vld1q_f32(b); b += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);
// Clamp: lower bound first, then upper bound.
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
vacc4567 = vminq_f32(vacc4567, vmax);
vst1q_f32(output, vacc0123); output += 4;
vst1q_f32(output, vacc4567); output += 4;
}
// Full groups of 4 channels.
for (; c >= 4; c -= 4) {
float32x4_t vacc0123p0 = vld1q_f32(b); b += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
vst1q_f32(output, vacc0123); output += 4;
}
// 1 to 3 channels left: full 4-float vectors are still loaded from the
// buffer, inputs and weights, but only `c` lanes are stored to `output`.
if XNN_UNLIKELY(c != 0) {
float32x4_t vacc0123p0 = vld1q_f32(b);
const float32x4_t vi0x0123 = vld1q_f32(i0);
float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1);
float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2);
float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3);
float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4);
float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
// Add up all accumulators to vacc0123p0
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
// Store 2 lanes if bit 1 of c is set, then 1 lane if bit 0 is set; after
// a 2-lane store the high half becomes the source for the single lane.
float32x2_t vacc01 = vget_low_f32(vacc0123);
if (c & 2) {
vst1_f32(output, vacc01); output += 2;
vacc01 = vget_high_f32(vacc0123);
}
if (c & 1) {
vst1_lane_f32(output, vacc01, 0); output += 1;
}
}
}
// Step to the next pixel's row pointers and output location (byte steps).
input = (const float**) ((uintptr_t) input + input_stride);
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
// Single-pass f32 depthwise-convolution microkernel with 9 taps and min/max
// clamping, written with NEON FMA intrinsics. For each output pixel it reads
// 9 row pointers, accumulates 9 multiply-adds per channel on top of an
// initial value read from `weights`, clamps to
// [params->scalar.min, params->scalar.max] and writes `channels` floats.
// Channels are processed 8 at a time, then 4, then a 1-3 channel remainder.
//
// channels         - channel count per pixel; asserted non-zero.
// output_width     - number of output pixels to produce; asserted non-zero.
// input            - array of row pointers; input[0..8] are read for each
//                    pixel, then `input` is advanced by input_stride bytes.
// weights          - packed weights; re-read from the start for every pixel.
//                    The addressing below is consistent with groups of
//                    8 initial-accumulator floats followed by 9 rows of
//                    8 tap weights (80 floats per group).
// output           - destination; advanced by output_increment bytes after
//                    each pixel's channels have been written.
// input_stride     - byte step applied to `input` per pixel.
// output_increment - byte step applied to `output` per pixel.
// input_offset     - byte offset added to every row pointer that is not
//                    equal to `zero`.
// zero             - row pointers equal to this value are used as-is
//                    (presumably a shared zero-padding row; confirm against
//                    the caller).
// params           - supplies scalar.min and scalar.max, each broadcast to
//                    all 4 lanes.
//
// NOTE(review): XNN_OOB_READS presumably advertises out-of-bounds reads; the
// remainder path does load full 4-float vectors from each input row even
// when only 1-3 channels remain.
void xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma(
size_t channels,
size_t output_width,
const float** input,
const float* weights,
float* output,
intptr_t input_stride,
size_t output_increment,
size_t input_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(channels != 0);
assert(output_width != 0);
// Clamp bounds, broadcast once and reused for every output pixel.
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
// One iteration of this loop produces one output pixel.
do {
// Resolve the 9 row pointers; rows equal to `zero` skip the byte offset.
const float* i0 = input[0];
assert(i0 != NULL);
if XNN_UNPREDICTABLE(i0 != zero) {
i0 = (const float*) ((uintptr_t) i0 + input_offset);
}
const float* i1 = input[1];
assert(i1 != NULL);
if XNN_UNPREDICTABLE(i1 != zero) {
i1 = (const float*) ((uintptr_t) i1 + input_offset);
}
const float* i2 = input[2];
assert(i2 != NULL);
if XNN_UNPREDICTABLE(i2 != zero) {
i2 = (const float*) ((uintptr_t) i2 + input_offset);
}
const float* i3 = input[3];
assert(i3 != NULL);
if XNN_UNPREDICTABLE(i3 != zero) {
i3 = (const float*) ((uintptr_t) i3 + input_offset);
}
const float* i4 = input[4];
assert(i4 != NULL);
if XNN_UNPREDICTABLE(i4 != zero) {
i4 = (const float*) ((uintptr_t) i4 + input_offset);
}
const float* i5 = input[5];
assert(i5 != NULL);
if XNN_UNPREDICTABLE(i5 != zero) {
i5 = (const float*) ((uintptr_t) i5 + input_offset);
}
const float* i6 = input[6];
assert(i6 != NULL);
if XNN_UNPREDICTABLE(i6 != zero) {
i6 = (const float*) ((uintptr_t) i6 + input_offset);
}
const float* i7 = input[7];
assert(i7 != NULL);
if XNN_UNPREDICTABLE(i7 != zero) {
i7 = (const float*) ((uintptr_t) i7 + input_offset);
}
const float* i8 = input[8];
assert(i8 != NULL);
if XNN_UNPREDICTABLE(i8 != zero) {
i8 = (const float*) ((uintptr_t) i8 + input_offset);
}
// The row-pointer array is advanced as soon as the 9 pointers are captured.
input = (const float**) ((uintptr_t) input + input_stride);
size_t c = channels;
const float* w = weights;
// Main loop: 8 channels per iteration. `w` is bumped by 4 floats after every
// vector load, so one iteration consumes 2 + 9 * 2 = 20 vectors (80 floats).
// All taps feed a single accumulator per vector, in tap order 0..8.
for (; c >= 8; c -= 8) {
// Initial accumulator values come from the weight stream
// (presumably the bias -- confirm against the weight packing code).
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi3x4567, vk3x4567);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567);
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk5x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi5x4567, vk5x4567);
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk6x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567);
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk7x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi7x4567, vk7x4567);
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4;
const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
const float32x4_t vk8x4567 = vld1q_f32(w); w += 4;
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567);
// Clamp: lower bound first, then upper bound.
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
vacc4567 = vminq_f32(vacc4567, vmax);
vst1q_f32(output, vacc0123); output += 4;
vst1q_f32(output, vacc4567); output += 4;
}
// 4-channel step (runs at most once, since c < 8 here). Only the initial
// accumulator load advances `w`; the taps are then read at w + 4, w + 12,
// ..., w + 68, i.e. 8 floats apart, which selects the first 4 lanes of each
// 8-wide tap row of the current weight group. `w` is left 4 floats into the
// group on exit.
for (; c >= 4; c -= 4) {
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
const float32x4_t vk0x0123 = vld1q_f32(w + 4);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
const float32x4_t vk1x0123 = vld1q_f32(w + 12);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
const float32x4_t vk2x0123 = vld1q_f32(w + 20);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
const float32x4_t vk3x0123 = vld1q_f32(w + 28);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
const float32x4_t vk4x0123 = vld1q_f32(w + 36);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
const float32x4_t vk5x0123 = vld1q_f32(w + 44);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
const float32x4_t vk6x0123 = vld1q_f32(w + 52);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
const float32x4_t vk7x0123 = vld1q_f32(w + 60);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
const float32x4_t vk8x0123 = vld1q_f32(w + 68);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
vst1q_f32(output, vacc0123); output += 4;
}
// Remainder of 1-3 channels. `w` is not advanced; taps are read at w + 8,
// w + 16, ..., w + 72 (again 8 floats apart). If the 4-channel step above
// ran, `w` sits 4 floats into the group, so these loads pick lanes 4-7 of
// each row; otherwise they pick lanes 0-3. Full vectors are loaded from the
// inputs, but only `c` lanes are stored.
if XNN_UNLIKELY(c != 0) {
float32x4_t vacc0123p0 = vld1q_f32(w);
const float32x4_t vi0x0123 = vld1q_f32(i0);
const float32x4_t vk0x0123 = vld1q_f32(w + 8);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
const float32x4_t vi1x0123 = vld1q_f32(i1);
const float32x4_t vk1x0123 = vld1q_f32(w + 16);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
const float32x4_t vi2x0123 = vld1q_f32(i2);
const float32x4_t vk2x0123 = vld1q_f32(w + 24);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
const float32x4_t vi3x0123 = vld1q_f32(i3);
const float32x4_t vk3x0123 = vld1q_f32(w + 32);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
const float32x4_t vi4x0123 = vld1q_f32(i4);
const float32x4_t vk4x0123 = vld1q_f32(w + 40);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
const float32x4_t vi5x0123 = vld1q_f32(i5);
const float32x4_t vk5x0123 = vld1q_f32(w + 48);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
const float32x4_t vi6x0123 = vld1q_f32(i6);
const float32x4_t vk6x0123 = vld1q_f32(w + 56);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
const float32x4_t vi7x0123 = vld1q_f32(i7);
const float32x4_t vk7x0123 = vld1q_f32(w + 64);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
const float32x4_t vi8x0123 = vld1q_f32(i8);
const float32x4_t vk8x0123 = vld1q_f32(w + 72);
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
vacc0123 = vminq_f32(vacc0123, vmax);
// Store 2 lanes if bit 1 of c is set, then 1 lane if bit 0 is set; after
// a 2-lane store the high half becomes the source for the single lane.
float32x2_t vacc01 = vget_low_f32(vacc0123);
if (c & 2) {
vst1_f32(output, vacc01); output += 2;
vacc01 = vget_high_f32(vacc0123);
}
if (c & 1) {
vst1_lane_f32(output, vacc01, 0); output += 1;
}
}
output = (float*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}
// f32 GEMM microkernel with min/max clamping for a single output row,
// producing up to 8 columns per outer iteration with NEON FMA intrinsics.
//
// For each column tile, two 4-lane accumulators are initialised from the
// first 8 floats of the packed weights `w`. The reduction dimension is then
// walked in blocks of 4 floats: one 4-float vector is loaded from `a`, and in
// each of 4 steps it is multiplied against 8 weights and rotated by one lane
// (vextq_f32(v, v, 1)). NOTE(review): this presumably relies on `w` being
// packed in a matching rotated order -- confirm against the packing routine.
//
// When kc is not a multiple of 4 floats, a final partial block still loads a
// full 4-float vector from `a`, and clears every lane of it whose matching
// weight lane compares equal to 0.0f before the multiply-add (presumably so
// that values read past the end of the row cannot contribute through
// zero-padded weights -- TODO confirm).
//
// mr        - number of rows; asserted to be exactly 1 (non-zero and <= 1).
// nc        - number of output columns; asserted non-zero.
// kc        - reduction size in bytes; asserted non-zero and a multiple of
//             sizeof(float).
// a         - input row; rewound by kc bytes after every full 8-column tile.
// a_stride  - unused by this single-row kernel.
// w         - packed weights, consumed sequentially.
// c         - output row.
// cm_stride - unused by this single-row kernel.
// cn_stride - byte step applied to the output pointer after a full 8-column
//             tile.
// params    - supplies scalar.max and scalar.min, each broadcast to all lanes.
void xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma(
    size_t mr,
    size_t nc,
    size_t kc,
    const float* restrict a,
    size_t a_stride,
    const float* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Size in bytes of one full block along the reduction dimension.
  const size_t kblock_bytes = 4 * sizeof(float);

  const float* lhs = a;
  float* dst = c;

  do {
    // Accumulators start from the first 8 packed floats of this tile.
    float32x4_t vsum_lo = vld1q_f32(w);
    float32x4_t vsum_hi = vld1q_f32(w + 4);
    w += 8;

    size_t kleft = kc;
    for (; kleft >= kblock_bytes; kleft -= kblock_bytes) {
      float32x4_t vlhs = vld1q_f32(lhs);
      lhs += 4;

      // Step 0: un-rotated input vector.
      const float32x4_t vrhs0_lo = vld1q_f32(w);
      const float32x4_t vrhs0_hi = vld1q_f32(w + 4);
      vsum_lo = vfmaq_f32(vsum_lo, vlhs, vrhs0_lo);
      vsum_hi = vfmaq_f32(vsum_hi, vlhs, vrhs0_hi);

      // Step 1: input rotated by one lane.
      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs1_lo = vld1q_f32(w + 8);
      const float32x4_t vrhs1_hi = vld1q_f32(w + 12);
      vsum_lo = vfmaq_f32(vsum_lo, vlhs, vrhs1_lo);
      vsum_hi = vfmaq_f32(vsum_hi, vlhs, vrhs1_hi);

      // Step 2: input rotated by two lanes.
      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs2_lo = vld1q_f32(w + 16);
      const float32x4_t vrhs2_hi = vld1q_f32(w + 20);
      vsum_lo = vfmaq_f32(vsum_lo, vlhs, vrhs2_lo);
      vsum_hi = vfmaq_f32(vsum_hi, vlhs, vrhs2_hi);

      // Step 3: input rotated by three lanes.
      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs3_lo = vld1q_f32(w + 24);
      const float32x4_t vrhs3_hi = vld1q_f32(w + 28);
      vsum_lo = vfmaq_f32(vsum_lo, vlhs, vrhs3_lo);
      vsum_hi = vfmaq_f32(vsum_hi, vlhs, vrhs3_hi);

      w += 32;
    }

    if XNN_UNLIKELY(kleft != 0) {
      // Partial block: a full vector is loaded, but the input pointer only
      // moves by the bytes that actually remain.
      float32x4_t vlhs = vld1q_f32(lhs);
      lhs = (const float*) ((uintptr_t) lhs + kleft);

      const float32x4_t vzero = vmovq_n_f32(0.0f);

      // In every step, input lanes are cleared wherever the weight is 0.0f.
      const float32x4_t vrhs0_lo = vld1q_f32(w);
      const float32x4_t vrhs0_hi = vld1q_f32(w + 4);
      const uint32x4_t vbits0 = vreinterpretq_u32_f32(vlhs);
      vsum_lo = vfmaq_f32(vsum_lo,
          vreinterpretq_f32_u32(vbicq_u32(vbits0, vceqq_f32(vrhs0_lo, vzero))), vrhs0_lo);
      vsum_hi = vfmaq_f32(vsum_hi,
          vreinterpretq_f32_u32(vbicq_u32(vbits0, vceqq_f32(vrhs0_hi, vzero))), vrhs0_hi);

      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs1_lo = vld1q_f32(w + 8);
      const float32x4_t vrhs1_hi = vld1q_f32(w + 12);
      const uint32x4_t vbits1 = vreinterpretq_u32_f32(vlhs);
      vsum_lo = vfmaq_f32(vsum_lo,
          vreinterpretq_f32_u32(vbicq_u32(vbits1, vceqq_f32(vrhs1_lo, vzero))), vrhs1_lo);
      vsum_hi = vfmaq_f32(vsum_hi,
          vreinterpretq_f32_u32(vbicq_u32(vbits1, vceqq_f32(vrhs1_hi, vzero))), vrhs1_hi);

      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs2_lo = vld1q_f32(w + 16);
      const float32x4_t vrhs2_hi = vld1q_f32(w + 20);
      const uint32x4_t vbits2 = vreinterpretq_u32_f32(vlhs);
      vsum_lo = vfmaq_f32(vsum_lo,
          vreinterpretq_f32_u32(vbicq_u32(vbits2, vceqq_f32(vrhs2_lo, vzero))), vrhs2_lo);
      vsum_hi = vfmaq_f32(vsum_hi,
          vreinterpretq_f32_u32(vbicq_u32(vbits2, vceqq_f32(vrhs2_hi, vzero))), vrhs2_hi);

      vlhs = vextq_f32(vlhs, vlhs, 1);
      const float32x4_t vrhs3_lo = vld1q_f32(w + 24);
      const float32x4_t vrhs3_hi = vld1q_f32(w + 28);
      const uint32x4_t vbits3 = vreinterpretq_u32_f32(vlhs);
      vsum_lo = vfmaq_f32(vsum_lo,
          vreinterpretq_f32_u32(vbicq_u32(vbits3, vceqq_f32(vrhs3_lo, vzero))), vrhs3_lo);
      vsum_hi = vfmaq_f32(vsum_hi,
          vreinterpretq_f32_u32(vbicq_u32(vbits3, vceqq_f32(vrhs3_hi, vzero))), vrhs3_hi);

      w += 32;
    }

    // Clamp: upper bound first, then lower bound.
    const float32x4_t vupper = vld1q_dup_f32(&params->scalar.max);
    const float32x4_t vlower = vld1q_dup_f32(&params->scalar.min);
    vsum_lo = vmaxq_f32(vminq_f32(vsum_lo, vupper), vlower);
    vsum_hi = vmaxq_f32(vminq_f32(vsum_hi, vupper), vlower);

    if XNN_LIKELY(nc >= 8) {
      // Full tile: store 8 columns, step the output, rewind the input row.
      vst1q_f32(dst, vsum_lo);
      vst1q_f32(dst + 4, vsum_hi);
      dst = (float*) ((uintptr_t) dst + cn_stride);
      lhs = (const float*) ((uintptr_t) lhs - kc);
      nc -= 8;
      continue;
    }

    // Final partial tile: emit 4, 2 and 1 columns according to the bits of nc.
    float32x4_t vtail = vsum_lo;
    if (nc & 4) {
      vst1q_f32(dst, vtail);
      dst += 4;
      vtail = vsum_hi;
    }
    float32x2_t vpair = vget_low_f32(vtail);
    if (nc & 2) {
      vst1_f32(dst, vpair);
      dst += 2;
      vpair = vget_high_f32(vtail);
    }
    if (nc & 1) {
      vst1_lane_f32(dst, vpair, 0);
    }
    return;
  } while (nc != 0);
}
void xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
const float* a0 = a;
float* c0 = c;
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
a2 = a1;
c2 = c1;
}
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}
do {
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
float32x4_t vacc1x0123 = vacc0x0123;
float32x4_t vacc1x4567 = vacc0x4567;
float32x4_t vacc2x0123 = vacc0x0123;
float32x4_t vacc2x4567 = vacc0x4567;
float32x4_t vacc3x0123 = vacc0x0123;
float32x4_t vacc3x4567 = vacc0x4567;
size_t k = kc;
while (k >= 4 * sizeof(float)) {
float32x4_t va0 = vld1q_f32(a0); a0 += 4;
float32x4_t va1 = vld1q_f32(a1); a1 += 4;
float32x4_t va2 = vld1q_f32(a2); a2 += 4;
float32x4_t va3 = vld1q_f32(a3); a3 += 4;
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
k -= 4 * sizeof(float);
}
if XNN_UNLIKELY(k != 0) {
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k);
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k);
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k);
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k);
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0);
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0);
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0);
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0);
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0);
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0);
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0);
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1);
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1);
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1);
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1);
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1);
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1);
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1);
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2);
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2);
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2);
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2);
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2);
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2);
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2);
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3);
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3);
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3);
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3);
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3);
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3);
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3);
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3);
}
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
vacc0x0123 = vminq_f32(vacc0x0123, vmax);
vacc1x0123 = vminq_f32(vacc1x0123, vmax);
vacc2x0123 = vminq_f32(vacc2x0123, vmax);
vacc3x0123 = vminq_f32(vacc3x0123, vmax);
vacc0x4567 = vminq_f32(vacc0x4567, vmax);
vacc1x4567 = vminq_f32(vacc1x4567, vmax);
vacc2x4567 = vminq_f32(vacc2x4567, vmax);
vacc3x4567 = vminq_f32(vacc3x4567, vmax);
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
if XNN_LIKELY(nc >= 8) {
vst1q_f32(c3, vacc3x0123);
vst1q_f32(c3 + 4, vacc3x4567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
vst1q_f32(c2, vacc2x0123);
vst1q_f32(c2 + 4, vacc2x4567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
vst1q_f32(c1, vacc1x0123);
vst1q_f32(c1 + 4, vacc1x4567);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
vst1q_f32(c0, vacc0x0123);
vst1q_f32(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a3 = (const float*) ((uintptr_t) a3 - kc);
a2 = (const float*) ((uintptr_t) a2 - kc);
a1 = (const float*) ((uintptr_t) a1 - kc);
a0 = (const float*) ((uintptr_t) a0 - kc);
nc -= 8;
} else {
if (nc & 4) {
vst1q_f32(c3, vacc3x0123); c3 += 4;
vst1q_f32(c2, vacc2x0123); c2 += 4;
vst1q_f32(c1, vacc1x0123); c1 += 4;
vst1q_f32(c0, vacc0x0123); c0 += 4;
vacc3x0123 = vacc3x4567;
vacc2x0123 = vacc2x4567;
vacc1x0123 = vacc1x4567;
vacc0x0123 = vacc0x4567;
}
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
if (nc & 2) {
vst1_f32(c3, vacc3x01); c3 += 2;
vst1_f32(c2, vacc2x01); c2 += 2;
vst1_f32(c1, vacc1x01); c1 += 2;
vst1_f32(c0, vacc0x01); c0 += 2;
vacc3x01 = vget_high_f32(vacc3x0123);
vacc2x01 = vget_high_f32(vacc2x0123);
vacc1x01 = vget_high_f32(vacc1x0123);
vacc0x01 = vget_high_f32(vacc0x0123);
}
if (nc & 1) {
vst1_lane_f32(c3, vacc3x01, 0);
vst1_lane_f32(c2, vacc2x01, 0);
vst1_lane_f32(c1, vacc1x01, 0);
vst1_lane_f32(c0, vacc0x01, 0);
}
nc = 0;
}
} while (nc != 0);
}
// F32 GEMM microkernel with output clamping, NEON + FMA, producing a tile of
// up to 6 rows by 8 columns per outer-loop iteration.
//
// Each 4-float vector loaded from a row of A is used in four consecutive
// multiply-accumulate steps and rotated by one lane (vextq_f32) between steps,
// so every step is a plain lane-wise vfmaq_f32 against a fresh pair of weight
// vectors.
// NOTE(review): this implies the packed weights are laid out to match the lane
// rotation - confirm against the weight packing routine.
//
// Parameters:
//   mr        - number of rows to compute, 1..6 (asserted). For rows beyond
//               mr the A/C row pointers alias the previous row, so those rows
//               simply recompute and re-store an already valid row.
//   nc        - number of output columns, non-zero; consumed 8 per iteration,
//               with a final partial tile of 1..7 columns.
//   kc        - size of one row of A in bytes; non-zero multiple of
//               sizeof(float) (asserted).
//   a         - pointer to the first row of A.
//   a_stride  - byte stride between consecutive rows of A.
//   w         - weight stream, read strictly sequentially. For each 8-column
//               tile: 8 floats that seed the accumulators (presumably the
//               bias - confirm against the packing routine), then 32 floats
//               per group of 4 k-elements. A trailing partial group also
//               consumes a full 32 floats.
//   c         - pointer to the first row of the output.
//   cm_stride - byte stride between consecutive rows of C.
//   cn_stride - bytes added to each C row pointer after a full 8-column tile.
//   params    - supplies the clamp bounds params->scalar.min / scalar.max.
//
// The remainder path loads a full 4-float vector from each row of A even when
// fewer than 4 floats remain, so it can read past the end of the row.
void xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
size_t a_stride,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mr != 0);
assert(mr <= 6);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
// Set up one input and one output pointer per row. When a row index is not
// below mr, both pointers fall back to the previous row so that all six rows
// can be processed unconditionally without touching memory outside the
// mr valid rows.
const float* a0 = a;
float* c0 = c;
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
a2 = a1;
c2 = c1;
}
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr < 4) {
a3 = a2;
c3 = c2;
}
const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 4) {
a4 = a3;
c4 = c3;
}
const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
if XNN_UNPREDICTABLE(mr != 6) {
a5 = a4;
c5 = c4;
}
// Outer loop: one 8-column tile of the output per iteration.
do {
// Seed the accumulators of row 0 from the first 8 floats of the weight
// stream and copy them to the other five rows.
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
float32x4_t vacc1x0123 = vacc0x0123;
float32x4_t vacc1x4567 = vacc0x4567;
float32x4_t vacc2x0123 = vacc0x0123;
float32x4_t vacc2x4567 = vacc0x4567;
float32x4_t vacc3x0123 = vacc0x0123;
float32x4_t vacc3x4567 = vacc0x4567;
float32x4_t vacc4x0123 = vacc0x0123;
float32x4_t vacc4x4567 = vacc0x4567;
float32x4_t vacc5x0123 = vacc0x0123;
float32x4_t vacc5x4567 = vacc0x4567;
// k counts the remaining bytes of the current A rows.
size_t k = kc;
// Main loop: consume 4 floats of every A row and 32 weight floats per
// iteration, as four FMA steps (c0..c3) separated by one-lane rotations of
// the A vectors.
while (k >= 4 * sizeof(float)) {
float32x4_t va0 = vld1q_f32(a0); a0 += 4;
float32x4_t va1 = vld1q_f32(a1); a1 += 4;
float32x4_t va2 = vld1q_f32(a2); a2 += 4;
float32x4_t va3 = vld1q_f32(a3); a3 += 4;
float32x4_t va4 = vld1q_f32(a4); a4 += 4;
float32x4_t va5 = vld1q_f32(a5); a5 += 4;
// Step c0: A vectors in their loaded lane order.
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
// Rotate every A vector by one lane before the next step.
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Step c1.
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Step c2.
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Step c3 (no rotation afterwards; the A vectors are reloaded).
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3);
k -= 4 * sizeof(float);
}
// Remainder: 1..3 floats of each A row are left. A full 4-float vector is
// still loaded (reading past the remaining data), but each row pointer only
// advances by the k remaining bytes. The same four steps are run, except
// that before every FMA the A lanes are forced to zero wherever the matching
// weight lane compares equal to 0.0f (vceqq_f32 builds the mask, vbicq_u32
// clears the masked bits of A).
// NOTE(review): presumably the packed weights are zero-padded in this partial
// group and the masking keeps over-read A values (possibly NaN/Inf) from
// contaminating the sum via x * 0 - confirm against the packing routine.
if XNN_UNLIKELY(k != 0) {
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k);
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k);
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k);
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k);
float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k);
float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k);
// Masked step c0.
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0);
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0);
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0);
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0);
const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0);
const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0);
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0);
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0);
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0);
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0);
const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0);
const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Masked step c1.
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1);
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1);
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1);
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1);
const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1);
const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1);
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1);
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1);
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1);
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1);
const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1);
const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Masked step c2.
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2);
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2);
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2);
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2);
const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2);
const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2);
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2);
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2);
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2);
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2);
const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2);
const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
// Masked step c3.
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3);
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3);
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3);
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3);
const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3);
const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3);
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3);
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3);
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3);
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3);
const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3);
const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3);
}
// Clamp from above: every accumulator becomes min(acc, params->scalar.max).
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
vacc0x0123 = vminq_f32(vacc0x0123, vmax);
vacc1x0123 = vminq_f32(vacc1x0123, vmax);
vacc2x0123 = vminq_f32(vacc2x0123, vmax);
vacc3x0123 = vminq_f32(vacc3x0123, vmax);
vacc4x0123 = vminq_f32(vacc4x0123, vmax);
vacc5x0123 = vminq_f32(vacc5x0123, vmax);
vacc0x4567 = vminq_f32(vacc0x4567, vmax);
vacc1x4567 = vminq_f32(vacc1x4567, vmax);
vacc2x4567 = vminq_f32(vacc2x4567, vmax);
vacc3x4567 = vminq_f32(vacc3x4567, vmax);
vacc4x4567 = vminq_f32(vacc4x4567, vmax);
vacc5x4567 = vminq_f32(vacc5x4567, vmax);
// Clamp from below: every accumulator becomes max(acc, params->scalar.min).
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
// Full tile: store all 8 columns of every row (highest row first), advance
// each output pointer by cn_stride bytes, and rewind the A pointers by kc
// bytes so the same rows of A are reused for the next 8 columns.
if XNN_LIKELY(nc >= 8) {
vst1q_f32(c5, vacc5x0123);
vst1q_f32(c5 + 4, vacc5x4567);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
vst1q_f32(c4, vacc4x0123);
vst1q_f32(c4 + 4, vacc4x4567);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
vst1q_f32(c3, vacc3x0123);
vst1q_f32(c3 + 4, vacc3x4567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
vst1q_f32(c2, vacc2x0123);
vst1q_f32(c2 + 4, vacc2x4567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
vst1q_f32(c1, vacc1x0123);
vst1q_f32(c1 + 4, vacc1x4567);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
vst1q_f32(c0, vacc0x0123);
vst1q_f32(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a5 = (const float*) ((uintptr_t) a5 - kc);
a4 = (const float*) ((uintptr_t) a4 - kc);
a3 = (const float*) ((uintptr_t) a3 - kc);
a2 = (const float*) ((uintptr_t) a2 - kc);
a1 = (const float*) ((uintptr_t) a1 - kc);
a0 = (const float*) ((uintptr_t) a0 - kc);
nc -= 8;
} else {
// Partial tile of 1..7 columns: store 4, then 2, then 1 column(s) according
// to the bits of nc, shifting the not-yet-stored lanes down after each
// store.
if (nc & 4) {
vst1q_f32(c5, vacc5x0123); c5 += 4;
vst1q_f32(c4, vacc4x0123); c4 += 4;
vst1q_f32(c3, vacc3x0123); c3 += 4;
vst1q_f32(c2, vacc2x0123); c2 += 4;
vst1q_f32(c1, vacc1x0123); c1 += 4;
vst1q_f32(c0, vacc0x0123); c0 += 4;
// Columns 4..7 become the pending low vector.
vacc5x0123 = vacc5x4567;
vacc4x0123 = vacc4x4567;
vacc3x0123 = vacc3x4567;
vacc2x0123 = vacc2x4567;
vacc1x0123 = vacc1x4567;
vacc0x0123 = vacc0x4567;
}
float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
if (nc & 2) {
vst1_f32(c5, vacc5x01); c5 += 2;
vst1_f32(c4, vacc4x01); c4 += 2;
vst1_f32(c3, vacc3x01); c3 += 2;
vst1_f32(c2, vacc2x01); c2 += 2;
vst1_f32(c1, vacc1x01); c1 += 2;
vst1_f32(c0, vacc0x01); c0 += 2;
// The upper two lanes become the pending pair.
vacc5x01 = vget_high_f32(vacc5x0123);
vacc4x01 = vget_high_f32(vacc4x0123);
vacc3x01 = vget_high_f32(vacc3x0123);
vacc2x01 = vget_high_f32(vacc2x0123);
vacc1x01 = vget_high_f32(vacc1x0123);
vacc0x01 = vget_high_f32(vacc0x0123);
}
if (nc & 1) {
vst1_lane_f32(c5, vacc5x01, 0);
vst1_lane_f32(c4, vacc4x01, 0);
vst1_lane_f32(c3, vacc3x01, 0);
vst1_lane_f32(c2, vacc2x01, 0);
vst1_lane_f32(c1, vacc1x01, 0);
vst1_lane_f32(c0, vacc0x01, 0);
}
// A partial tile is always the last one; terminate the outer loop.
nc = 0;
}
} while (nc != 0);
}
void xnn_f32_ibilinear_chw_ukernel__neonfma_p8(
size_t output_pixels,
size_t channels,
const float** restrict input,
size_t input_offset,
const float* restrict weights,
float* restrict output,
size_t input_increment) XNN_OOB_READS
{
assert(output_pixels != 0);
assert(channels != 0);
assert(input_increment % sizeof(float) == 0);
do {
const float** i = input;
const float* w = weights;
size_t p = output_pixels;
for (; p >= 8; p -= 8) {
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset);
const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset);
const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset);
const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset);
const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset);
const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset);
const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset);
const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset);
i += 2 * 8;
const float32x4x2_t vw0123 = vld2q_f32(w + 0);
const float32x4x2_t vw4567 = vld2q_f32(w + 8);
w += 2 * 8;
const float32x2_t vtltr0 = vld1_f32(itl0);
const float32x2_t vblbr0 = vld1_f32(ibl0);
const float32x2_t vtltr1 = vld1_f32(itl1);
const float32x2_t vblbr1 = vld1_f32(ibl1);
const float32x2_t vtltr2 = vld1_f32(itl2);
const float32x2_t vblbr2 = vld1_f32(ibl2);
const float32x2_t vtltr3 = vld1_f32(itl3);
const float32x2_t vblbr3 = vld1_f32(ibl3);
const float32x2_t vtltr4 = vld1_f32(itl4);
const float32x2_t vblbr4 = vld1_f32(ibl4);
const float32x2_t vtltr5 = vld1_f32(itl5);
const float32x2_t vblbr5 = vld1_f32(ibl5);
const float32x2_t vtltr6 = vld1_f32(itl6);
const float32x2_t vblbr6 = vld1_f32(ibl6);
const float32x2_t vtltr7 = vld1_f32(itl7);
const float32x2_t vblbr7 = vld1_f32(ibl7);
const float32x4_t valphah0123 = vw0123.val[0];
const float32x4_t valphav0123 = vw0123.val[1];
const float32x4_t valphah4567 = vw4567.val[0];
const float32x4_t valphav4567 = vw4567.val[1];
const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5);
const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5);
const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7);
const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7);
const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45);
const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67);
const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23);
const float32x4_t vld0123 = vld_t0123.val[0];
const float32x4_t vrd0123 = vld_t0123.val[1];
const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67);
const float32x4_t vld4567 = vld_t4567.val[0];
const float32x4_t vrd4567 = vld_t4567.val[1];
const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23);
const float32x4_t vtl0123 = vtl_t0123.val[0];
const float32x4_t vtr0123 = vtl_t0123.val[1];
const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67);
const float32x4_t vtl4567 = vtl_t4567.val[0];
const float32x4_t vtr4567 = vtl_t4567.val[1];
const float32x4_t vl0123 = vfmaq_f32(vtl0123, vld0123, valphav0123);
const float32x4_t vr0123 = vfmaq_f32(vtr0123, vrd0123, valphav0123);
const float32x4_t vl4567 = vfmaq_f32(vtl4567, vld4567, valphav4567);
const float32x4_t vr4567 = vfmaq_f32(vtr4567, vrd4567, valphav4567);
const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123);
const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567);
const float32x4_t vo0123 = vfmaq_f32(vl0123, vd0123, valphah0123);
const float32x4_t vo4567 = vfmaq_f32(vl4567, vd4567, valphah4567);
vst1q_f32(output + 0, vo0123);
vst1q_f32(output + 4, vo4567);
output += 8;
}
for (; p >= 4; p -= 4) {
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset);
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset);
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset);
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset);
i += 8;
const float32x4x2_t vw = vld2q_f32(w);
w += 8;
const float32x2_t vtltr0 = vld1_f32(itl0);
const float32x2_t vblbr0 = vld1_f32(ibl0);
const float32x2_t vtltr1 = vld1_f32(itl1);
const float32x2_t vblbr1 = vld1_f32(ibl1);
const float32x2_t vtltr2 = vld1_f32(itl2);
const float32x2_t vblbr2 = vld1_f32(ibl2);
const float32x2_t vtltr3 = vld1_f32(itl3);
const float32x2_t vblbr3 = vld1_f32(ibl3);
const float32x4_t valphah = vw.val[0];
const float32x4_t valphav = vw.val[1];
const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1);
const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1);
const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3);
const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3);
const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01);
const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23);
const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23);
const float32x4_t vld = vld_t.val[0];
const float32x4_t vrd = vld_t.val[1];
const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23);
const float32x4_t vtl = vtl_t.val[0];
const float32x4_t vtr = vtl_t.val[1];
const float32x4_t vl = vfmaq_f32(vtl, vld, valphav);
const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav);
const float32x4_t vd = vsubq_f32(vr, vl);
const float32x4_t vo = vfmaq_f32(vl, vd, valphah);
vst1q_f32(output, vo);
output += 4;
}
if XNN_UNLIKELY(p != 0) {
if (p & 2) {
const float32x2x2_t vw = vld2_f32(w);
w += 4;
const float32x2_t valphah = vw.val[0];
const float32x2_t valphav = vw.val[1];
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset);
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset);
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
const float32x2_t vtltr0 = vld1_f32(itl0);
const float32x2_t vblbr0 = vld1_f32(ibl0);
const float32x2_t vtltr1 = vld1_f32(itl1);
const float32x2_t vblbr1 = vld1_f32(ibl1);
const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0);
const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1);
const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1);
const float32x2_t vld = vld_t.val[0];
const float32x2_t vrd = vld_t.val[1];
const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1);
const float32x2_t vtl = vtl_t.val[0];
const float32x2_t vtr = vtl_t.val[1];
const float32x2_t vl = vfma_f32(vtl, vld, valphav);
const float32x2_t vr = vfma_f32(vtr, vrd, valphav);
const float32x2_t vd = vsub_f32(vr, vl);
const float32x2_t vo = vfma_f32(vl, vd, valphah);
vst1_f32(output, vo);
output += 2;
}
if (p & 1) {
// We are computing the following formula:
// result = (1 - alpha_h) * (1 - alpha_v) * top_left +
// alpha_h * (1 - alpha_v) * top_right +
// (1 - alpha_h) * alpha_v * bottom_left +
// alpha_h * alpha_v * bottom_right.
//
// Rearranging gives
// result = left + alpha_h * (right - left),
// where
// left = top_left + alpha_v * (bottom_left - top_left),
// right = top_right + alpha_v * (bottom_right - top_right).
const float alphah = *w;
const float32x2_t valphav = vld1_dup_f32(w + 1);
w += 2;
const float* itl = (const float*) ((uintptr_t) i[0] + input_offset);
const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset);
i += 2;
const float32x2_t vtltr = vld1_f32(itl);
const float32x2_t vblbr = vld1_f32(ibl);
// Compute at once
// left_diff = bottom_left - top_left
// right_diff = bottom_right - top_right
const float32x2_t vldrd = vsub_f32(vblbr, vtltr);
const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav);
// Extract them and compute the result.
const float l = vget_lane_f32(vlr, 0);
const float r = vget_lane_f32(vlr, 1);
*output++ = l + alphah * (r - l);
}
}
input_offset += input_increment;
} while (--channels != 0);
}
// Blends four channels of the four corner vectors in one go:
//   top    = tl + alpha_h * (tr - tl)
//   bottom = bl + alpha_h * (br - bl)
//   result = top + alpha_v * (bottom - top)
// valphahv carries alpha_h in lane 0 and alpha_v in lane 1.
static inline float32x4_t xnn_f32_ibilinear_neonfma_c8_blend4(
    float32x4_t vtl,
    float32x4_t vtr,
    float32x4_t vbl,
    float32x4_t vbr,
    float32x2_t valphahv)
{
  const float32x4_t vtop_diff = vsubq_f32(vtr, vtl);
  const float32x4_t vbottom_diff = vsubq_f32(vbr, vbl);
#if XNN_ARCH_ARM
  // This configuration broadcasts each weight to a full vector and uses the
  // plain vector FMA instead of the by-lane form.
  const float32x4_t valphah = vdupq_lane_f32(valphahv, 0);
  const float32x4_t valphav = vdupq_lane_f32(valphahv, 1);
  const float32x4_t vtop = vfmaq_f32(vtl, vtop_diff, valphah);
  const float32x4_t vbottom = vfmaq_f32(vbl, vbottom_diff, valphah);
  return vfmaq_f32(vtop, vsubq_f32(vbottom, vtop), valphav);
#else
  const float32x4_t vtop = vfmaq_lane_f32(vtl, vtop_diff, valphahv, 0);
  const float32x4_t vbottom = vfmaq_lane_f32(vbl, vbottom_diff, valphahv, 0);
  return vfmaq_lane_f32(vtop, vsubq_f32(vbottom, vtop), valphahv, 1);
#endif
}

// Bilinear interpolation micro-kernel processing up to 8 channels per step.
//
//   output_pixels    - number of output pixels to produce; must be non-zero.
//   channels         - channel extent in BYTES; must be a non-zero multiple of
//                      sizeof(float).
//   input            - pointer table; every output pixel consumes four entries
//                      in the order top-left, top-right, bottom-left,
//                      bottom-right.
//   input_offset     - byte offset added to every pointer read from `input`.
//   weights          - two floats per output pixel: horizontal weight followed
//                      by vertical weight.
//   output           - destination; channels of one pixel are written
//                      contiguously.
//   output_increment - byte amount added to `output` after each pixel.
//
// The channel tail (1..3 floats) still loads a full 4-float vector from every
// corner, so the kernel may read past the last valid channel.
void xnn_f32_ibilinear_ukernel__neonfma_c8(
    size_t output_pixels,
    size_t channels,
    const float** restrict input,
    size_t input_offset,
    const float* restrict weights,
    float* restrict output,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(channels != 0);
  assert(channels % sizeof(float) == 0);

  do {
    // Resolve the four corner rows for this output pixel.
    const float* top_left = (const float*) ((uintptr_t) input[0] + input_offset);
    const float* top_right = (const float*) ((uintptr_t) input[1] + input_offset);
    const float* bottom_left = (const float*) ((uintptr_t) input[2] + input_offset);
    const float* bottom_right = (const float*) ((uintptr_t) input[3] + input_offset);
    input += 4;

    // Lane 0 = horizontal weight, lane 1 = vertical weight.
    const float32x2_t valphahv = vld1_f32(weights);
    weights += 2;

    // `channels` is expressed in bytes, so the countdown is in bytes as well.
    size_t remaining = channels;

    // Main loop: 8 channels per iteration, all loads issued before any store.
    while (remaining >= 8 * sizeof(float)) {
      const float32x4_t vtl_lo = vld1q_f32(top_left);
      const float32x4_t vtr_lo = vld1q_f32(top_right);
      const float32x4_t vbl_lo = vld1q_f32(bottom_left);
      const float32x4_t vbr_lo = vld1q_f32(bottom_right);
      const float32x4_t vtl_hi = vld1q_f32(top_left + 4);
      const float32x4_t vtr_hi = vld1q_f32(top_right + 4);
      const float32x4_t vbl_hi = vld1q_f32(bottom_left + 4);
      const float32x4_t vbr_hi = vld1q_f32(bottom_right + 4);
      top_left += 8;
      top_right += 8;
      bottom_left += 8;
      bottom_right += 8;

      const float32x4_t vout_lo =
          xnn_f32_ibilinear_neonfma_c8_blend4(vtl_lo, vtr_lo, vbl_lo, vbr_lo, valphahv);
      const float32x4_t vout_hi =
          xnn_f32_ibilinear_neonfma_c8_blend4(vtl_hi, vtr_hi, vbl_hi, vbr_hi, valphahv);

      vst1q_f32(output, vout_lo);
      vst1q_f32(output + 4, vout_hi);
      output += 8;
      remaining -= 8 * sizeof(float);
    }

    // Fewer than 8 channels are left here, so a 4-channel step runs at most once.
    if (remaining >= 4 * sizeof(float)) {
      const float32x4_t vtl = vld1q_f32(top_left);
      const float32x4_t vtr = vld1q_f32(top_right);
      const float32x4_t vbl = vld1q_f32(bottom_left);
      const float32x4_t vbr = vld1q_f32(bottom_right);
      top_left += 4;
      top_right += 4;
      bottom_left += 4;
      bottom_right += 4;

      const float32x4_t vout =
          xnn_f32_ibilinear_neonfma_c8_blend4(vtl, vtr, vbl, vbr, valphahv);
      vst1q_f32(output, vout);
      output += 4;
      remaining -= 4 * sizeof(float);
    }

    // Tail of 1..3 channels: compute a full vector, store only the valid part.
    if XNN_UNLIKELY(remaining != 0) {
      const float32x4_t vtl = vld1q_f32(top_left);
      const float32x4_t vtr = vld1q_f32(top_right);
      const float32x4_t vbl = vld1q_f32(bottom_left);
      const float32x4_t vbr = vld1q_f32(bottom_right);

      const float32x4_t vout =
          xnn_f32_ibilinear_neonfma_c8_blend4(vtl, vtr, vbl, vbr, valphahv);

      float32x2_t vpair = vget_low_f32(vout);
      if (remaining & (2 * sizeof(float))) {
        vst1_f32(output, vpair);
        output += 2;
        // The odd channel, if any, now lives in the upper half.
        vpair = vget_high_f32(vout);
      }
      if (remaining & (1 * sizeof(float))) {
        vst1_lane_f32(output, vpair, 0);
        output += 1;
      }
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
// Returns va with every lane zeroed where the matching lane of vb compares
// equal to 0.0f; all other lanes are passed through bit-for-bit.
static inline float32x4_t xnn_f32_igemm_1x8s4_neonfma_mask_a(float32x4_t va, float32x4_t vb)
{
  const uint32x4_t vb_is_zero = vceqq_f32(vb, vmovq_n_f32(0.0f));
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va), vb_is_zero));
}

// Indirect GEMM micro-kernel with min/max clamping for a 1x8 output tile.
//
//   mr        - number of rows; must be exactly 1 for this kernel.
//   nc        - number of output columns to produce; handled 8 at a time with
//               a 4/2/1 tail.
//   kc        - bytes consumed from the input row per indirection entry; must
//               be a non-zero multiple of sizeof(float).
//   ks        - bytes of indirection entries consumed per output tile; must be
//               a non-zero multiple of sizeof(void*).
//   a         - indirection table of input-row pointers; rewound by `ks` bytes
//               after every full 8-column tile.
//   w         - packed weights: 8 initial accumulator values per tile followed
//               by groups of 8 floats.
//   c         - output row.
//   cm_stride - not used by this single-row kernel.
//   cn_stride - byte amount added to the output pointer after a full tile.
//   a_offset  - byte offset applied to each row pointer, except pointers equal
//               to `zero`.
//   zero      - sentinel row pointer that is used without applying `a_offset`.
//   params    - supplies scalar.min / scalar.max for the final clamp.
//
// Within every block of four K elements the input vector is rotated by one
// lane between four successive weight groups.
void xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const float** restrict a,
    const float* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  float* out = c;

  do {
    // Each tile starts from the 8 values stored at the head of its weights.
    float32x4_t vacc_lo = vld1q_f32(w);
    float32x4_t vacc_hi = vld1q_f32(w + 4);
    w += 8;

    size_t p = ks;
    do {
      const float* restrict a0 = a[0];
      assert(a0 != NULL);
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      // Full blocks of four K elements: 4 weight groups of 8 floats each.
      for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
        float32x4_t va0 = vld1q_f32(a0);
        a0 += 4;

        vacc_lo = vfmaq_f32(vacc_lo, va0, vld1q_f32(w + 0));
        vacc_hi = vfmaq_f32(vacc_hi, va0, vld1q_f32(w + 4));

        va0 = vextq_f32(va0, va0, 1);
        vacc_lo = vfmaq_f32(vacc_lo, va0, vld1q_f32(w + 8));
        vacc_hi = vfmaq_f32(vacc_hi, va0, vld1q_f32(w + 12));

        va0 = vextq_f32(va0, va0, 1);
        vacc_lo = vfmaq_f32(vacc_lo, va0, vld1q_f32(w + 16));
        vacc_hi = vfmaq_f32(vacc_hi, va0, vld1q_f32(w + 20));

        va0 = vextq_f32(va0, va0, 1);
        vacc_lo = vfmaq_f32(vacc_lo, va0, vld1q_f32(w + 24));
        vacc_hi = vfmaq_f32(vacc_hi, va0, vld1q_f32(w + 28));

        w += 32;
      }

      // 1..3 K elements left. A whole vector is still loaded and all four
      // weight groups are applied, but input lanes are zeroed wherever the
      // weight is zero.
      // NOTE(review): presumably the packed weights are zero-padded for the
      // missing K elements, so over-read input cannot reach the sum - confirm
      // against the packing routine.
      if XNN_UNLIKELY(k != 0) {
        float32x4_t va0 = vld1q_f32(a0);
        a0 = (const float*) ((uintptr_t) a0 + k);

        {
          const float32x4_t vb_lo = vld1q_f32(w + 0);
          const float32x4_t vb_hi = vld1q_f32(w + 4);
          vacc_lo = vfmaq_f32(vacc_lo, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_lo), vb_lo);
          vacc_hi = vfmaq_f32(vacc_hi, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        {
          const float32x4_t vb_lo = vld1q_f32(w + 8);
          const float32x4_t vb_hi = vld1q_f32(w + 12);
          vacc_lo = vfmaq_f32(vacc_lo, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_lo), vb_lo);
          vacc_hi = vfmaq_f32(vacc_hi, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        {
          const float32x4_t vb_lo = vld1q_f32(w + 16);
          const float32x4_t vb_hi = vld1q_f32(w + 20);
          vacc_lo = vfmaq_f32(vacc_lo, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_lo), vb_lo);
          vacc_hi = vfmaq_f32(vacc_hi, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        {
          const float32x4_t vb_lo = vld1q_f32(w + 24);
          const float32x4_t vb_hi = vld1q_f32(w + 28);
          vacc_lo = vfmaq_f32(vacc_lo, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_lo), vb_lo);
          vacc_hi = vfmaq_f32(vacc_hi, xnn_f32_igemm_1x8s4_neonfma_mask_a(va0, vb_hi), vb_hi);
        }

        w += 32;
      }

      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Clamp: upper bound first, then lower bound.
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc_lo = vminq_f32(vacc_lo, vmax);
    vacc_hi = vminq_f32(vacc_hi, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc_lo = vmaxq_f32(vacc_lo, vmin);
    vacc_hi = vmaxq_f32(vacc_hi, vmin);

    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(out, vacc_lo);
      vst1q_f32(out + 4, vacc_hi);
      out = (float*) ((uintptr_t) out + cn_stride);

      // Rewind the indirection table so the next tile re-reads the same rows.
      a = (const float**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      // Final partial tile: emit 4, then 2, then 1 column as flagged by nc.
      if (nc & 4) {
        vst1q_f32(out, vacc_lo);
        out += 4;
        vacc_lo = vacc_hi;
      }
      float32x2_t vacc_pair = vget_low_f32(vacc_lo);
      if (nc & 2) {
        vst1_f32(out, vacc_pair);
        out += 2;
        vacc_pair = vget_high_f32(vacc_lo);
      }
      if (nc & 1) {
        vst1_lane_f32(out, vacc_pair, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
// Returns va with every bit cleared that is set in vclear (va AND NOT vclear),
// i.e. lanes selected by an all-ones mask become +0.0f.
static inline float32x4_t xnn_f32_igemm_4x8s4_neonfma_clear_lanes(float32x4_t va, uint32x4_t vclear)
{
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va), vclear));
}

// Indirect GEMM micro-kernel with min/max clamping for an output tile of up to
// 4 rows by 8 columns.
//
//   mr        - number of valid rows, 1..4. Output pointers of rows beyond
//               `mr` alias the previous row.
//   nc        - number of output columns to produce; handled 8 at a time with
//               a 4/2/1 tail.
//   kc        - bytes consumed from each input row per group of indirection
//               entries; must be a non-zero multiple of sizeof(float).
//   ks        - bytes of indirection entries consumed per output tile; must be
//               a non-zero multiple of 4 * sizeof(void*).
//   a         - indirection table; four row pointers are consumed per step and
//               the table is rewound by `ks` bytes after every full tile.
//   w         - packed weights: 8 initial accumulator values per tile (shared
//               by all rows) followed by groups of 8 floats.
//   c         - output for row 0.
//   cm_stride - byte stride between consecutive output rows.
//   cn_stride - byte amount added to each output pointer after a full tile.
//   a_offset  - byte offset applied to each row pointer, except pointers equal
//               to `zero`.
//   zero      - sentinel row pointer that is used without applying `a_offset`.
//   params    - supplies scalar.min / scalar.max for the final clamp.
//
// Within every block of four K elements each input vector is rotated by one
// lane between four successive weight groups.
void xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const float** restrict a,
    const float* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(ks != 0);
  assert(ks % (4 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Output row pointers. A row that does not exist is redirected to the
  // previous row, so its stores land on memory that a lower row overwrites.
  float* out0 = c;
  float* out1 = (float*) ((uintptr_t) out0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    out1 = out0;
  }
  float* out2 = (float*) ((uintptr_t) out1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    out2 = out1;
  }
  float* out3 = (float*) ((uintptr_t) out2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    out3 = out2;
  }

  do {
    // All four rows start from the same 8 values stored ahead of the weights.
    float32x4_t vacc0_lo = vld1q_f32(w);
    float32x4_t vacc0_hi = vld1q_f32(w + 4);
    w += 8;
    float32x4_t vacc1_lo = vacc0_lo;
    float32x4_t vacc1_hi = vacc0_hi;
    float32x4_t vacc2_lo = vacc0_lo;
    float32x4_t vacc2_hi = vacc0_hi;
    float32x4_t vacc3_lo = vacc0_lo;
    float32x4_t vacc3_hi = vacc0_hi;

    size_t p = ks;
    do {
      const float* restrict a0 = a[0];
      assert(a0 != NULL);
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
      }
      const float* restrict a1 = a[1];
      assert(a1 != NULL);
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const float*) ((uintptr_t) a1 + a_offset);
      }
      const float* restrict a2 = a[2];
      assert(a2 != NULL);
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const float*) ((uintptr_t) a2 + a_offset);
      }
      const float* restrict a3 = a[3];
      assert(a3 != NULL);
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const float*) ((uintptr_t) a3 + a_offset);
      }
      a += 4;

      size_t k = kc;
      // Full blocks of four K elements: 4 weight groups of 8 floats each.
      for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
        float32x4_t va0 = vld1q_f32(a0);
        float32x4_t va1 = vld1q_f32(a1);
        float32x4_t va2 = vld1q_f32(a2);
        float32x4_t va3 = vld1q_f32(a3);
        a0 += 4;
        a1 += 4;
        a2 += 4;
        a3 += 4;

        // Weight group 0.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 0);
          const float32x4_t vb_hi = vld1q_f32(w + 4);
          vacc0_lo = vfmaq_f32(vacc0_lo, va0, vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, va1, vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, va2, vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, va3, vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, va0, vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, va1, vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, va2, vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, va3, vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 1.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 8);
          const float32x4_t vb_hi = vld1q_f32(w + 12);
          vacc0_lo = vfmaq_f32(vacc0_lo, va0, vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, va1, vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, va2, vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, va3, vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, va0, vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, va1, vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, va2, vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, va3, vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 2.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 16);
          const float32x4_t vb_hi = vld1q_f32(w + 20);
          vacc0_lo = vfmaq_f32(vacc0_lo, va0, vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, va1, vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, va2, vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, va3, vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, va0, vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, va1, vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, va2, vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, va3, vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 3.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 24);
          const float32x4_t vb_hi = vld1q_f32(w + 28);
          vacc0_lo = vfmaq_f32(vacc0_lo, va0, vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, va1, vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, va2, vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, va3, vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, va0, vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, va1, vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, va2, vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, va3, vb_hi);
        }

        w += 32;
      }

      // 1..3 K elements left. Whole vectors are still loaded and all four
      // weight groups are applied, but input lanes are zeroed wherever the
      // weight is zero.
      // NOTE(review): presumably the packed weights are zero-padded for the
      // missing K elements, so over-read input cannot reach the sums - confirm
      // against the packing routine.
      if XNN_UNLIKELY(k != 0) {
        float32x4_t va0 = vld1q_f32(a0);
        float32x4_t va1 = vld1q_f32(a1);
        float32x4_t va2 = vld1q_f32(a2);
        float32x4_t va3 = vld1q_f32(a3);
        a0 = (const float*) ((uintptr_t) a0 + k);
        a1 = (const float*) ((uintptr_t) a1 + k);
        a2 = (const float*) ((uintptr_t) a2 + k);
        a3 = (const float*) ((uintptr_t) a3 + k);

        // Weight group 0.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 0);
          const float32x4_t vb_hi = vld1q_f32(w + 4);
          const uint32x4_t vzero_lo = vceqq_f32(vb_lo, vmovq_n_f32(0.0f));
          const uint32x4_t vzero_hi = vceqq_f32(vb_hi, vmovq_n_f32(0.0f));
          vacc0_lo = vfmaq_f32(vacc0_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_lo), vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_lo), vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_lo), vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_lo), vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_hi), vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_hi), vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_hi), vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 1.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 8);
          const float32x4_t vb_hi = vld1q_f32(w + 12);
          const uint32x4_t vzero_lo = vceqq_f32(vb_lo, vmovq_n_f32(0.0f));
          const uint32x4_t vzero_hi = vceqq_f32(vb_hi, vmovq_n_f32(0.0f));
          vacc0_lo = vfmaq_f32(vacc0_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_lo), vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_lo), vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_lo), vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_lo), vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_hi), vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_hi), vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_hi), vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 2.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 16);
          const float32x4_t vb_hi = vld1q_f32(w + 20);
          const uint32x4_t vzero_lo = vceqq_f32(vb_lo, vmovq_n_f32(0.0f));
          const uint32x4_t vzero_hi = vceqq_f32(vb_hi, vmovq_n_f32(0.0f));
          vacc0_lo = vfmaq_f32(vacc0_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_lo), vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_lo), vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_lo), vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_lo), vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_hi), vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_hi), vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_hi), vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_hi), vb_hi);
        }
        va0 = vextq_f32(va0, va0, 1);
        va1 = vextq_f32(va1, va1, 1);
        va2 = vextq_f32(va2, va2, 1);
        va3 = vextq_f32(va3, va3, 1);

        // Weight group 3.
        {
          const float32x4_t vb_lo = vld1q_f32(w + 24);
          const float32x4_t vb_hi = vld1q_f32(w + 28);
          const uint32x4_t vzero_lo = vceqq_f32(vb_lo, vmovq_n_f32(0.0f));
          const uint32x4_t vzero_hi = vceqq_f32(vb_hi, vmovq_n_f32(0.0f));
          vacc0_lo = vfmaq_f32(vacc0_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_lo), vb_lo);
          vacc1_lo = vfmaq_f32(vacc1_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_lo), vb_lo);
          vacc2_lo = vfmaq_f32(vacc2_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_lo), vb_lo);
          vacc3_lo = vfmaq_f32(vacc3_lo, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_lo), vb_lo);
          vacc0_hi = vfmaq_f32(vacc0_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va0, vzero_hi), vb_hi);
          vacc1_hi = vfmaq_f32(vacc1_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va1, vzero_hi), vb_hi);
          vacc2_hi = vfmaq_f32(vacc2_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va2, vzero_hi), vb_hi);
          vacc3_hi = vfmaq_f32(vacc3_hi, xnn_f32_igemm_4x8s4_neonfma_clear_lanes(va3, vzero_hi), vb_hi);
        }

        w += 32;
      }

      p -= 4 * sizeof(void*);
    } while (p != 0);

    // Clamp: upper bound first, then lower bound.
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc0_lo = vminq_f32(vacc0_lo, vmax);
    vacc1_lo = vminq_f32(vacc1_lo, vmax);
    vacc2_lo = vminq_f32(vacc2_lo, vmax);
    vacc3_lo = vminq_f32(vacc3_lo, vmax);
    vacc0_hi = vminq_f32(vacc0_hi, vmax);
    vacc1_hi = vminq_f32(vacc1_hi, vmax);
    vacc2_hi = vminq_f32(vacc2_hi, vmax);
    vacc3_hi = vminq_f32(vacc3_hi, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc0_lo = vmaxq_f32(vacc0_lo, vmin);
    vacc1_lo = vmaxq_f32(vacc1_lo, vmin);
    vacc2_lo = vmaxq_f32(vacc2_lo, vmin);
    vacc3_lo = vmaxq_f32(vacc3_lo, vmin);
    vacc0_hi = vmaxq_f32(vacc0_hi, vmin);
    vacc1_hi = vmaxq_f32(vacc1_hi, vmin);
    vacc2_hi = vmaxq_f32(vacc2_hi, vmin);
    vacc3_hi = vmaxq_f32(vacc3_hi, vmin);

    // Rows are always written from row 3 down to row 0: when row pointers
    // alias (mr < 4), the lowest row is stored last and its values remain.
    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(out3, vacc3_lo);
      vst1q_f32(out3 + 4, vacc3_hi);
      out3 = (float*) ((uintptr_t) out3 + cn_stride);
      vst1q_f32(out2, vacc2_lo);
      vst1q_f32(out2 + 4, vacc2_hi);
      out2 = (float*) ((uintptr_t) out2 + cn_stride);
      vst1q_f32(out1, vacc1_lo);
      vst1q_f32(out1 + 4, vacc1_hi);
      out1 = (float*) ((uintptr_t) out1 + cn_stride);
      vst1q_f32(out0, vacc0_lo);
      vst1q_f32(out0 + 4, vacc0_hi);
      out0 = (float*) ((uintptr_t) out0 + cn_stride);

      // Rewind the indirection table so the next tile re-reads the same rows.
      a = (const float**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      // Final partial tile: emit 4, then 2, then 1 column as flagged by nc.
      if (nc & 4) {
        vst1q_f32(out3, vacc3_lo);
        out3 += 4;
        vst1q_f32(out2, vacc2_lo);
        out2 += 4;
        vst1q_f32(out1, vacc1_lo);
        out1 += 4;
        vst1q_f32(out0, vacc0_lo);
        out0 += 4;
        vacc3_lo = vacc3_hi;
        vacc2_lo = vacc2_hi;
        vacc1_lo = vacc1_hi;
        vacc0_lo = vacc0_hi;
      }
      float32x2_t vacc3_pair = vget_low_f32(vacc3_lo);
      float32x2_t vacc2_pair = vget_low_f32(vacc2_lo);
      float32x2_t vacc1_pair = vget_low_f32(vacc1_lo);
      float32x2_t vacc0_pair = vget_low_f32(vacc0_lo);
      if (nc & 2) {
        vst1_f32(out3, vacc3_pair);
        out3 += 2;
        vst1_f32(out2, vacc2_pair);
        out2 += 2;
        vst1_f32(out1, vacc1_pair);
        out1 += 2;
        vst1_f32(out0, vacc0_pair);
        out0 += 2;
        vacc3_pair = vget_high_f32(vacc3_lo);
        vacc2_pair = vget_high_f32(vacc2_lo);
        vacc1_pair = vget_high_f32(vacc1_lo);
        vacc0_pair = vget_high_f32(vacc0_lo);
      }
      if (nc & 1) {
        vst1_lane_f32(out3, vacc3_pair, 0);
        vst1_lane_f32(out2, vacc2_pair, 0);
        vst1_lane_f32(out1, vacc1_pair, 0);
        vst1_lane_f32(out0, vacc0_pair, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
void xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
const float** restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
const float* zero,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
assert(mr <= 6);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);
assert(ks != 0);
assert(ks % (6 * sizeof(void*)) == 0);
assert(a_offset % sizeof(float) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
float* c0 = c;
float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
c1 = c0;
}
float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
c2 = c1;
}
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr < 4) {
c3 = c2;
}
float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 4) {
c4 = c3;
}
float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
if XNN_UNPREDICTABLE(mr != 6) {
c5 = c4;
}
do {
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
float32x4_t vacc1x0123 = vacc0x0123;
float32x4_t vacc1x4567 = vacc0x4567;
float32x4_t vacc2x0123 = vacc0x0123;
float32x4_t vacc2x4567 = vacc0x4567;
float32x4_t vacc3x0123 = vacc0x0123;
float32x4_t vacc3x4567 = vacc0x4567;
float32x4_t vacc4x0123 = vacc0x0123;
float32x4_t vacc4x4567 = vacc0x4567;
float32x4_t vacc5x0123 = vacc0x0123;
float32x4_t vacc5x4567 = vacc0x4567;
size_t p = ks;
do {
const float* restrict a0 = a[0];
assert(a0 != NULL);
if XNN_UNPREDICTABLE(a0 != zero) {
a0 = (const float*) ((uintptr_t) a0 + a_offset);
}
const float* restrict a1 = a[1];
assert(a1 != NULL);
if XNN_UNPREDICTABLE(a1 != zero) {
a1 = (const float*) ((uintptr_t) a1 + a_offset);
}
const float* restrict a2 = a[2];
assert(a2 != NULL);
if XNN_UNPREDICTABLE(a2 != zero) {
a2 = (const float*) ((uintptr_t) a2 + a_offset);
}
const float* restrict a3 = a[3];
assert(a3 != NULL);
if XNN_UNPREDICTABLE(a3 != zero) {
a3 = (const float*) ((uintptr_t) a3 + a_offset);
}
const float* restrict a4 = a[4];
assert(a4 != NULL);
if XNN_UNPREDICTABLE(a4 != zero) {
a4 = (const float*) ((uintptr_t) a4 + a_offset);
}
const float* restrict a5 = a[5];
assert(a5 != NULL);
if XNN_UNPREDICTABLE(a5 != zero) {
a5 = (const float*) ((uintptr_t) a5 + a_offset);
}
a += 6;
size_t k = kc;
while (k >= 4 * sizeof(float)) {
float32x4_t va0 = vld1q_f32(a0); a0 += 4;
float32x4_t va1 = vld1q_f32(a1); a1 += 4;
float32x4_t va2 = vld1q_f32(a2); a2 += 4;
float32x4_t va3 = vld1q_f32(a3); a3 += 4;
float32x4_t va4 = vld1q_f32(a4); a4 += 4;
float32x4_t va5 = vld1q_f32(a5); a5 += 4;
const float32x4_t vb0123c0 = vld1q_f32(w + 0);
const float32x4_t vb4567c0 = vld1q_f32(w + 4);
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c1 = vld1q_f32(w + 8);
const float32x4_t vb4567c1 = vld1q_f32(w + 12);
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c2 = vld1q_f32(w + 16);
const float32x4_t vb4567c2 = vld1q_f32(w + 20);
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c3 = vld1q_f32(w + 24);
const float32x4_t vb4567c3 = vld1q_f32(w + 28);
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3);
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3);
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3);
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3);
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3);
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3);
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3);
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3);
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3);
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3);
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3);
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3);
w += 32;
k -= 4 * sizeof(float);
}
if XNN_UNLIKELY(k != 0) {
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k);
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k);
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k);
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k);
float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k);
float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k);
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0);
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0);
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0);
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0);
const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0);
const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0);
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0);
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0);
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0);
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0);
const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0);
const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1);
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1);
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1);
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1);
const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1);
const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1);
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1);
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1);
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1);
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1);
const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1);
const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2);
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2);
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2);
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2);
const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2);
const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2);
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2);
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2);
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2);
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2);
const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2);
const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2);
va0 = vextq_f32(va0, va0, 1);
va1 = vextq_f32(va1, va1, 1);
va2 = vextq_f32(va2, va2, 1);
va3 = vextq_f32(va3, va3, 1);
va4 = vextq_f32(va4, va4, 1);
va5 = vextq_f32(va5, va5, 1);
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3);
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3);
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3);
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3);
const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3);
const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3);
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3);
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3);
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3);
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3);
const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3);
const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3);
}
p -= 6 * sizeof(void*);
} while (p != 0);
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
vacc0x0123 = vminq_f32(vacc0x0123, vmax);
vacc1x0123 = vminq_f32(vacc1x0123, vmax);
vacc2x0123 = vminq_f32(vacc2x0123, vmax);
vacc3x0123 = vminq_f32(vacc3x0123, vmax);
vacc4x0123 = vminq_f32(vacc4x0123, vmax);
vacc5x0123 = vminq_f32(vacc5x0123, vmax);
vacc0x4567 = vminq_f32(vacc0x4567, vmax);
vacc1x4567 = vminq_f32(vacc1x4567, vmax);
vacc2x4567 = vminq_f32(vacc2x4567, vmax);
vacc3x4567 = vminq_f32(vacc3x4567, vmax);
vacc4x4567 = vminq_f32(vacc4x4567, vmax);
vacc5x4567 = vminq_f32(vacc5x4567, vmax);
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
if XNN_LIKELY(nc >= 8) {
vst1q_f32(c5, vacc5x0123);
vst1q_f32(c5 + 4, vacc5x4567);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
vst1q_f32(c4, vacc4x0123);
vst1q_f32(c4 + 4, vacc4x4567);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
vst1q_f32(c3, vacc3x0123);
vst1q_f32(c3 + 4, vacc3x4567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
vst1q_f32(c2, vacc2x0123);
vst1q_f32(c2 + 4, vacc2x4567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
vst1q_f32(c1, vacc1x0123);
vst1q_f32(c1 + 4, vacc1x4567);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
vst1q_f32(c0, vacc0x0123);
vst1q_f32(c0 + 4, vacc0x4567);
c0 = (float*) ((uintptr_t) c0 + cn_stride);
a = (const float**restrict) ((uintptr_t) a - ks);
nc -= 8;
} else {
if (nc & 4) {
vst1q_f32(c5, vacc5x0123); c5 += 4;
vst1q_f32(c4, vacc4x0123); c4 += 4;
vst1q_f32(c3, vacc3x0123); c3 += 4;
vst1q_f32(c2, vacc2x0123); c2 += 4;
vst1q_f32(c1, vacc1x0123); c1 += 4;
vst1q_f32(c0, vacc0x0123); c0 += 4;
vacc5x0123 = vacc5x4567;
vacc4x0123 = vacc4x4567;
vacc3x0123 = vacc3x4567;
vacc2x0123 = vacc2x4567;
vacc1x0123 = vacc1x4567;
vacc0x0123 = vacc0x4567;
}
float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
if (nc & 2) {
vst1_f32(c5, vacc5x01); c5 += 2;
vst1_f32(c4, vacc4x01); c4 += 2;
vst1_f32(c3, vacc3x01); c3 += 2;
vst1_f32(c2, vacc2x01); c2 += 2;
vst1_f32(c1, vacc1x01); c1 += 2;
vst1_f32(c0, vacc0x01); c0 += 2;
vacc5x01 = vget_high_f32(vacc5x0123);
vacc4x01 = vget_high_f32(vacc4x0123);
vacc3x01 = vget_high_f32(vacc3x0123);
vacc2x01 = vget_high_f32(vacc2x0123);
vacc1x01 = vget_high_f32(vacc1x0123);
vacc0x01 = vget_high_f32(vacc0x0123);
}
if (nc & 1) {
vst1_lane_f32(c5, vacc5x01, 0);
vst1_lane_f32(c4, vacc4x01, 0);
vst1_lane_f32(c3, vacc3x01, 0);
vst1_lane_f32(c2, vacc2x01, 0);
vst1_lane_f32(c1, vacc1x01, 0);
vst1_lane_f32(c0, vacc0x01, 0);
}
nc = 0;
}
} while (nc != 0);
}
// 64-entry float table defined in another translation unit. The kernel that
// follows indexes it with values masked to 0x3F, i.e. entries 0..63.
// NOTE(review): going by the name, entry k presumably holds 2**(k/64);
// confirm against the table's definition.
extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64];
// For every float in `input`, subtracts *max, evaluates the table-plus-
// polynomial expression described below, stores the result to `output`, and
// finally writes the sum of all stored results to *sum.
// NOTE(review): judging by the kernel name and the log2e / minus_ln2
// parameters this approximates exp(input[i] - *max); confirm against the
// parameter initialisation code.
//
//   batch  - size of input/output in BYTES; asserted non-zero and a multiple
//            of sizeof(float).
//   input  - source elements.
//   max    - points to the single value subtracted from every element.
//   output - destination, one float per input float.
//   sum    - receives the sum of every value written to output.
//   params - constants read from the neonfma_rr1_lut64_p2 member
//            (log2e, magic_bias, minus_ln2, c2, denorm_cutoff).
//
// Per 4-lane vector x = input - max the kernel evaluates:
//   n  = magic_bias + x * log2e
//   e  = (bits(n) & ~0x3F) << 17     (bit 6 of n lands on bit 23)
//   l  = xnn_table_exp2_k_over_64[bits(n) & 0x3F]
//   s  = bits(l) + e                 (integer add on the raw bit patterns)
//   n -= magic_bias
//   t  = x + n * minus_ln2
//   p  = t + t * (t * c2)
//   f  = s + s * p, with all bits cleared in lanes where x < denorm_cutoff
//
// The function is tagged XNN_OOB_READS: the 1..3 element tail loads a full
// 4-float vector from `input`.
void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16(
    size_t batch,
    const float* input,
    const float* max,
    float* output,
    float* sum,
    const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(max != NULL);
  assert(output != NULL);
  assert(sum != NULL);

  // Broadcast every scalar constant across a 4-lane register once, up front.
  const float32x4_t vshift = vld1q_dup_f32(max);
  const float32x4_t vscale = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.log2e);
  const float32x4_t vbias = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.magic_bias);
  const int32x4_t vlut_mask = vmovq_n_s32(INT32_C(0x3F));
  const float32x4_t vneg_ln2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.minus_ln2);
  const float32x4_t vcoef2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.c2);
  const float32x4_t vcutoff = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.denorm_cutoff);

  // Running 4-lane sum of everything written to output.
  float32x4_t vtotal = vmovq_n_f32(0.0f);

  // Main loop: 16 floats (four vectors) per iteration. All four loads happen
  // before any store, and the accumulation order is vector 0, 1, 2, 3.
  while (batch >= 16 * sizeof(float)) {
    const float32x4_t vin0 = vld1q_f32(input);
    const float32x4_t vin1 = vld1q_f32(input + 4);
    const float32x4_t vin2 = vld1q_f32(input + 8);
    const float32x4_t vin3 = vld1q_f32(input + 12);
    input += 16;

    // ---- vector 0 ----
    const float32x4_t vx0 = vsubq_f32(vin0, vshift);
    float32x4_t vn0 = vfmaq_f32(vbias, vx0, vscale);
    const int32x4_t vbits0 = vreinterpretq_s32_f32(vn0);
    const int32x4_t vexp0 = vshlq_n_s32(vbicq_s32(vbits0, vlut_mask), 17);
    // Two 32-bit table indices are packed into each 64-bit lane.
    const uint64x2_t vpair0 = vreinterpretq_u64_s32(vandq_s32(vbits0, vlut_mask));
    const uint64_t vpair0_lo = vgetq_lane_u64(vpair0, 0);
    const uint64_t vpair0_hi = vgetq_lane_u64(vpair0, 1);
    float32x2_t vtab0_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair0_lo]);
    float32x2_t vtab0_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair0_hi]);
    vtab0_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair0_lo >> 32)], vtab0_lo, 1);
    vtab0_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair0_hi >> 32)], vtab0_hi, 1);
    const float32x4_t vtab0 = vcombine_f32(vtab0_lo, vtab0_hi);
    const float32x4_t vs0 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab0), vexp0));
    vn0 = vsubq_f32(vn0, vbias);
    const float32x4_t vt0 = vfmaq_f32(vx0, vn0, vneg_ln2);
    float32x4_t vp0 = vmulq_f32(vt0, vcoef2);
    vp0 = vfmaq_f32(vt0, vt0, vp0);
    float32x4_t vf0 = vfmaq_f32(vs0, vs0, vp0);
    vf0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0), vcltq_f32(vx0, vcutoff)));

    // ---- vector 1 ----
    const float32x4_t vx1 = vsubq_f32(vin1, vshift);
    float32x4_t vn1 = vfmaq_f32(vbias, vx1, vscale);
    const int32x4_t vbits1 = vreinterpretq_s32_f32(vn1);
    const int32x4_t vexp1 = vshlq_n_s32(vbicq_s32(vbits1, vlut_mask), 17);
    const uint64x2_t vpair1 = vreinterpretq_u64_s32(vandq_s32(vbits1, vlut_mask));
    const uint64_t vpair1_lo = vgetq_lane_u64(vpair1, 0);
    const uint64_t vpair1_hi = vgetq_lane_u64(vpair1, 1);
    float32x2_t vtab1_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair1_lo]);
    float32x2_t vtab1_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair1_hi]);
    vtab1_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair1_lo >> 32)], vtab1_lo, 1);
    vtab1_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair1_hi >> 32)], vtab1_hi, 1);
    const float32x4_t vtab1 = vcombine_f32(vtab1_lo, vtab1_hi);
    const float32x4_t vs1 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab1), vexp1));
    vn1 = vsubq_f32(vn1, vbias);
    const float32x4_t vt1 = vfmaq_f32(vx1, vn1, vneg_ln2);
    float32x4_t vp1 = vmulq_f32(vt1, vcoef2);
    vp1 = vfmaq_f32(vt1, vt1, vp1);
    float32x4_t vf1 = vfmaq_f32(vs1, vs1, vp1);
    vf1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf1), vcltq_f32(vx1, vcutoff)));

    // ---- vector 2 ----
    const float32x4_t vx2 = vsubq_f32(vin2, vshift);
    float32x4_t vn2 = vfmaq_f32(vbias, vx2, vscale);
    const int32x4_t vbits2 = vreinterpretq_s32_f32(vn2);
    const int32x4_t vexp2 = vshlq_n_s32(vbicq_s32(vbits2, vlut_mask), 17);
    const uint64x2_t vpair2 = vreinterpretq_u64_s32(vandq_s32(vbits2, vlut_mask));
    const uint64_t vpair2_lo = vgetq_lane_u64(vpair2, 0);
    const uint64_t vpair2_hi = vgetq_lane_u64(vpair2, 1);
    float32x2_t vtab2_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair2_lo]);
    float32x2_t vtab2_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair2_hi]);
    vtab2_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair2_lo >> 32)], vtab2_lo, 1);
    vtab2_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair2_hi >> 32)], vtab2_hi, 1);
    const float32x4_t vtab2 = vcombine_f32(vtab2_lo, vtab2_hi);
    const float32x4_t vs2 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab2), vexp2));
    vn2 = vsubq_f32(vn2, vbias);
    const float32x4_t vt2 = vfmaq_f32(vx2, vn2, vneg_ln2);
    float32x4_t vp2 = vmulq_f32(vt2, vcoef2);
    vp2 = vfmaq_f32(vt2, vt2, vp2);
    float32x4_t vf2 = vfmaq_f32(vs2, vs2, vp2);
    vf2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf2), vcltq_f32(vx2, vcutoff)));

    // ---- vector 3 ----
    const float32x4_t vx3 = vsubq_f32(vin3, vshift);
    float32x4_t vn3 = vfmaq_f32(vbias, vx3, vscale);
    const int32x4_t vbits3 = vreinterpretq_s32_f32(vn3);
    const int32x4_t vexp3 = vshlq_n_s32(vbicq_s32(vbits3, vlut_mask), 17);
    const uint64x2_t vpair3 = vreinterpretq_u64_s32(vandq_s32(vbits3, vlut_mask));
    const uint64_t vpair3_lo = vgetq_lane_u64(vpair3, 0);
    const uint64_t vpair3_hi = vgetq_lane_u64(vpair3, 1);
    float32x2_t vtab3_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair3_lo]);
    float32x2_t vtab3_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair3_hi]);
    vtab3_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair3_lo >> 32)], vtab3_lo, 1);
    vtab3_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair3_hi >> 32)], vtab3_hi, 1);
    const float32x4_t vtab3 = vcombine_f32(vtab3_lo, vtab3_hi);
    const float32x4_t vs3 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab3), vexp3));
    vn3 = vsubq_f32(vn3, vbias);
    const float32x4_t vt3 = vfmaq_f32(vx3, vn3, vneg_ln2);
    float32x4_t vp3 = vmulq_f32(vt3, vcoef2);
    vp3 = vfmaq_f32(vt3, vt3, vp3);
    float32x4_t vf3 = vfmaq_f32(vs3, vs3, vp3);
    vf3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf3), vcltq_f32(vx3, vcutoff)));

    vst1q_f32(output, vf0);
    vst1q_f32(output + 4, vf1);
    vst1q_f32(output + 8, vf2);
    vst1q_f32(output + 12, vf3);
    output += 16;

    vtotal = vaddq_f32(vtotal, vf0);
    vtotal = vaddq_f32(vtotal, vf1);
    vtotal = vaddq_f32(vtotal, vf2);
    vtotal = vaddq_f32(vtotal, vf3);

    batch -= 16 * sizeof(float);
  }

  // Secondary loop: one full vector (4 floats) per iteration.
  while (batch >= 4 * sizeof(float)) {
    const float32x4_t vin = vld1q_f32(input);
    input += 4;

    const float32x4_t vx = vsubq_f32(vin, vshift);
    float32x4_t vn = vfmaq_f32(vbias, vx, vscale);
    const int32x4_t vbits = vreinterpretq_s32_f32(vn);
    const int32x4_t vexp = vshlq_n_s32(vbicq_s32(vbits, vlut_mask), 17);
    const uint64x2_t vpair = vreinterpretq_u64_s32(vandq_s32(vbits, vlut_mask));
    const uint64_t vpair_lo = vgetq_lane_u64(vpair, 0);
    const uint64_t vpair_hi = vgetq_lane_u64(vpair, 1);
    float32x2_t vtab_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair_lo]);
    float32x2_t vtab_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair_hi]);
    vtab_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair_lo >> 32)], vtab_lo, 1);
    vtab_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair_hi >> 32)], vtab_hi, 1);
    const float32x4_t vtab = vcombine_f32(vtab_lo, vtab_hi);
    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab), vexp));
    vn = vsubq_f32(vn, vbias);
    const float32x4_t vt = vfmaq_f32(vx, vn, vneg_ln2);
    float32x4_t vp = vmulq_f32(vt, vcoef2);
    vp = vfmaq_f32(vt, vt, vp);
    float32x4_t vf = vfmaq_f32(vs, vs, vp);
    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vcutoff)));

    vst1q_f32(output, vf);
    output += 4;
    vtotal = vaddq_f32(vtotal, vf);

    batch -= 4 * sizeof(float);
  }

  // Collapse the 4-lane accumulator: to a scalar on ARM64, to a 2-lane vector
  // (high half + low half) elsewhere; the 2-lane form is folded at the end.
  #if XNN_ARCH_ARM64
    float vpartial = vaddvq_f32(vtotal);
  #else
    float32x2_t vpartial = vadd_f32(vget_high_f32(vtotal), vget_low_f32(vtotal));
  #endif

  // Tail: 1 to 3 floats remain. A whole vector is still loaded and evaluated;
  // only the valid lanes are stored and added to the sum.
  if (batch != 0) {
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 3 * sizeof(float));

    const float32x4_t vin = vld1q_f32(input);

    const float32x4_t vx = vsubq_f32(vin, vshift);
    float32x4_t vn = vfmaq_f32(vbias, vx, vscale);
    const int32x4_t vbits = vreinterpretq_s32_f32(vn);
    const int32x4_t vexp = vshlq_n_s32(vbicq_s32(vbits, vlut_mask), 17);
    const uint64x2_t vpair = vreinterpretq_u64_s32(vandq_s32(vbits, vlut_mask));
    const uint64_t vpair_lo = vgetq_lane_u64(vpair, 0);
    const uint64_t vpair_hi = vgetq_lane_u64(vpair, 1);
    float32x2_t vtab_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair_lo]);
    float32x2_t vtab_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vpair_hi]);
    vtab_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair_lo >> 32)], vtab_lo, 1);
    vtab_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vpair_hi >> 32)], vtab_hi, 1);
    const float32x4_t vtab = vcombine_f32(vtab_lo, vtab_hi);
    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab), vexp));
    vn = vsubq_f32(vn, vbias);
    const float32x4_t vt = vfmaq_f32(vx, vn, vneg_ln2);
    float32x4_t vp = vmulq_f32(vt, vcoef2);
    vp = vfmaq_f32(vt, vt, vp);
    float32x4_t vf = vfmaq_f32(vs, vs, vp);
    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vcutoff)));

    // Candidate lanes for the next store; starts as lanes 0-1.
    float32x2_t vhalf = vget_low_f32(vf);
    if ((batch & (2 * sizeof(float))) != 0) {
      vst1_f32(output, vhalf);
      output += 2;
      #if XNN_ARCH_ARM64
        vpartial += vaddv_f32(vhalf);
      #else
        vpartial = vadd_f32(vpartial, vhalf);
      #endif
      // Lanes 2-3 become the candidates for a possible final single element.
      vhalf = vget_high_f32(vf);
    }
    if ((batch & (1 * sizeof(float))) != 0) {
      vst1_lane_f32(output, vhalf, 0);
      #if XNN_ARCH_ARM64
        vpartial += vget_lane_f32(vhalf, 0);
      #else
        // Shifting the 64-bit view left by 32 moves lane 0 into lane 1 and
        // zeroes lane 0, so only the one valid element joins the sum.
        vpartial = vadd_f32(vpartial, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vhalf), 32)));
      #endif
    }
  }

  #if XNN_ARCH_ARM64
    *sum = vpartial;
  #else
    // Pairwise add folds the two remaining lanes; lane 0 holds the total.
    vst1_lane_f32(sum, vpadd_f32(vpartial, vpartial), 0);
  #endif
}
void xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined(
size_t mc,
size_t nc,
const float* input,
const float* weights,
const int32_t* widx_dmap,
const uint32_t* nidx_nnzmap,
float* output,
size_t output_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(mc != 0);
assert(mc % sizeof(float) == 0);
assert(nc != 0);
#if XNN_ARCH_ARM64
const float32x4x2_t vminmax = vld2q_dup_f32(&params->scalar.min);
const float32x4_t vmin = vminmax.val[0];
const float32x4_t vmax = vminmax.val[1];
#else
const float32x2x2_t vminmax = vld2_dup_f32(&params->scalar.min);
const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]);
const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]);
#endif
size_t output_decrement = output_stride * nc - 32 * sizeof(float);
while XNN_LIKELY(mc >= 32 * sizeof(float)) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
float32x4_t vw = vld1q_dup_f32(w); w += 1;
intptr_t diff = *dmap++;
float32x4_t vi0123 = vld1q_f32(input);
float32x4_t vi4567 = vld1q_f32(input + 4);
float32x4_t vi89AB = vld1q_f32(input + 8);
float32x4_t viCDEF = vld1q_f32(input + 12);
float32x4_t viGHIJ = vld1q_f32(input + 16);
float32x4_t viKLMN = vld1q_f32(input + 20);
float32x4_t viOPQR = vld1q_f32(input + 24);
float32x4_t viSTUV = vld1q_f32(input + 28);
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x4_t vacc0123 = vw;
float32x4_t vacc4567 = vw;
float32x4_t vacc89AB = vw;
float32x4_t vaccCDEF = vw;
float32x4_t vaccGHIJ = vw;
float32x4_t vaccKLMN = vw;
float32x4_t vaccOPQR = vw;
float32x4_t vaccSTUV = vw;
vw = vld1q_dup_f32(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
vaccGHIJ = vfmaq_f32(vaccGHIJ, viGHIJ, vw);
vaccKLMN = vfmaq_f32(vaccKLMN, viKLMN, vw);
vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw);
vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
diff = *dmap++;
vw = vld1q_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vi0123 = vld1q_f32(input);
vi4567 = vld1q_f32(input + 4);
vi89AB = vld1q_f32(input + 8);
viCDEF = vld1q_f32(input + 12);
viGHIJ = vld1q_f32(input + 16);
viKLMN = vld1q_f32(input + 20);
viOPQR = vld1q_f32(input + 24);
viSTUV = vld1q_f32(input + 28);
} while (--nnz != 0);
}
float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);
float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax);
float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax);
float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax);
float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax);
vout0123 = vmaxq_f32(vout0123, vmin);
vout4567 = vmaxq_f32(vout4567, vmin);
vout89AB = vmaxq_f32(vout89AB, vmin);
voutCDEF = vmaxq_f32(voutCDEF, vmin);
voutGHIJ = vmaxq_f32(voutGHIJ, vmin);
voutKLMN = vmaxq_f32(voutKLMN, vmin);
voutOPQR = vmaxq_f32(voutOPQR, vmin);
voutSTUV = vmaxq_f32(voutSTUV, vmin);
vst1q_f32(output, vout0123);
vst1q_f32(output + 4, vout4567);
vst1q_f32(output + 8, vout89AB);
vst1q_f32(output + 12, voutCDEF);
vst1q_f32(output + 16, voutGHIJ);
vst1q_f32(output + 20, voutKLMN);
vst1q_f32(output + 24, voutOPQR);
vst1q_f32(output + 28, voutSTUV);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 32;
mc -= 32 * sizeof(float);
}
if XNN_UNLIKELY(mc != 0) {
output_decrement += 16 * sizeof(float);
if (mc & (16 * sizeof(float))) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
float32x4_t vacc4567 = vacc0123;
float32x4_t vacc89AB = vacc0123;
float32x4_t vaccCDEF = vacc0123;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const float32x4_t vi0123 = vld1q_f32(input);
const float32x4_t vi4567 = vld1q_f32(input + 4);
const float32x4_t vi89AB = vld1q_f32(input + 8);
const float32x4_t viCDEF = vld1q_f32(input + 12);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
const float32x4_t vb = vld1q_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
vacc4567 = vfmaq_f32(vacc4567, vi4567, vb);
vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vb);
vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vb);
} while (--nnz != 0);
}
float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);
vout0123 = vmaxq_f32(vout0123, vmin);
vout4567 = vmaxq_f32(vout4567, vmin);
vout89AB = vmaxq_f32(vout89AB, vmin);
voutCDEF = vmaxq_f32(voutCDEF, vmin);
vst1q_f32(output, vout0123);
vst1q_f32(output + 4, vout4567);
vst1q_f32(output + 8, vout89AB);
vst1q_f32(output + 12, voutCDEF);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 16;
}
output_decrement += 8 * sizeof(float);
if (mc & (8 * sizeof(float))) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
float32x4_t vacc4567 = vacc0123;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const float32x4_t vi0123 = vld1q_f32(input);
const float32x4_t vi4567 = vld1q_f32(input + 4);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
const float32x4_t vb = vld1q_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
vacc4567 = vfmaq_f32(vacc4567, vi4567, vb);
} while (--nnz != 0);
}
float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
vout0123 = vmaxq_f32(vout0123, vmin);
vout4567 = vmaxq_f32(vout4567, vmin);
vst1q_f32(output, vout0123);
vst1q_f32(output + 4, vout4567);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 8;
}
output_decrement += 4 * sizeof(float);
if (mc & (4 * sizeof(float))) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const float32x4_t vi0123 = vld1q_f32(input);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
const float32x4_t vb = vld1q_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb);
} while (--nnz != 0);
}
float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
vout0123 = vmaxq_f32(vout0123, vmin);
vst1q_f32(output, vout0123);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 4;
}
output_decrement += 2 * sizeof(float);
if (mc & (2 * sizeof(float))) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const float32x2_t vi01 = vld1_f32(input);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
const float32x2_t vb = vld1_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vacc01 = vfma_f32(vacc01, vi01, vb);
} while (--nnz != 0);
}
float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
vout01 = vmax_f32(vout01, vget_low_f32(vmin));
vst1_f32(output, vout01);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 2;
}
output_decrement += 1 * sizeof(float);
if (mc & (1 * sizeof(float))) {
const float* w = weights;
const int32_t* dmap = widx_dmap;
const uint32_t* nnzmap = nidx_nnzmap;
size_t n = nc;
do {
uint32_t nnz = *nnzmap++;
float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
if XNN_LIKELY(nnz != 0) {
do {
const intptr_t diff = *dmap++;
const float32x2_t vi0 = vld1_dup_f32(input);
input = (const float*) ((uintptr_t) input + (uintptr_t) diff);
xnn_prefetch_to_l1(input + 16);
xnn_prefetch_to_l1(input + 32);
const float32x2_t vb = vld1_dup_f32(w); w += 1;
xnn_prefetch_to_l1(w + 32);
vacc0 = vfma_f32(vacc0, vi0, vb);
} while (--nnz != 0);
}
float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
vout0 = vmax_f32(vout0, vget_low_f32(vmin));
vst1_lane_f32(output, vout0, 0);
output = (float*) ((uintptr_t) output + output_stride);
} while (--n != 0);
output = (float*) ((uintptr_t) output - output_decrement);
input += 1;
}
}
}
extern XNN_INTERNAL const int32_t xnn_table_exp2minus_k_over_16[16];
// ELU micro-kernel for float32 using NEON FMA intrinsics; the main loop handles 16 elements per iteration.
//
// For every input element x the kernel stores
//   x * beta                      when the lane test x < 0 is false,
//   alpha * (s * P(t) - 1)        when x < 0,
// where (rounding aside)
//   z    = max(x * prescale, sat_cutoff)
//   n    = magic_bias + z * log2e, with magic_bias subtracted again before use
//   s    = float whose bits are table[bits(n) & 0xF] + (bits(n) << 19)
//   t    = z + n * minus_ln2
//   P(t) = 1 + t + c2 * t^2 + c3 * t^3
// NOTE(review): s * P(t) - 1 presumably approximates exp(z) - 1; the constants live in params and the
// table contents are defined elsewhere, so this cannot be confirmed from this function alone.
//
// Parameters:
//   batch  - size of the input in BYTES; must be non-zero and a multiple of sizeof(float) (asserted).
//   input  - source floats; must not be NULL. The tail path loads a full 4-float vector even when
//            fewer elements remain (the function is annotated XNN_OOB_READS).
//   output - destination floats; must not be NULL. Exactly batch bytes are written.
//   params - constants are read from params->neonfma_rr1_lut16_p3.
void xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(float) == 0);
assert(input != NULL);
assert(output != NULL);
// Broadcast every scalar constant to all four lanes once, outside the loops.
const float32x4_t vprescale = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.prescale);
const float32x4_t valpha = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.alpha);
const float32x4_t vbeta = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.beta);
const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.sat_cutoff);
const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.magic_bias);
const float32x4_t vlog2e = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.log2e);
// Mask selecting the low 4 bits of n's bit pattern: the index into the 16-entry table.
const int32x4_t vindex_mask = vmovq_n_s32(0xF);
const float32x4_t vminus_ln2 = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.minus_ln2);
const float32x4_t vc3 = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.c3);
const float32x4_t vc2 = vld1q_dup_f32(&params->neonfma_rr1_lut16_p3.c2);
const float32x4_t vone = vmovq_n_f32(1.0f);
// Main loop: 16 floats (four 4-lane vectors) per iteration; the four vectors are processed in lock-step.
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
float32x4_t vx0123 = vld1q_f32(input); input += 4;
float32x4_t vx4567 = vld1q_f32(input); input += 4;
float32x4_t vx89AB = vld1q_f32(input); input += 4;
float32x4_t vxCDEF = vld1q_f32(input); input += 4;
// z = max(x * prescale, sat_cutoff): z is bounded from below by sat_cutoff.
const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff);
const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff);
const float32x4_t vz89AB = vmaxq_f32(vmulq_f32(vx89AB, vprescale), vsat_cutoff);
const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff);
// n = magic_bias + z * log2e (fused multiply-add).
// NOTE(review): magic_bias is presumably chosen so that the rounded value of z * log2e (in 1/16 steps)
// sits in the low bits of n's bit pattern - confirm against the params initialization.
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vlog2e);
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vlog2e);
float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vlog2e);
float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vlog2e);
// vidx: (bits(n) & 0xF) << 2, i.e. a byte offset (index * sizeof(int32_t)) into the 16-entry table.
// ven:  bits(n) << 19, which moves bit 4 and above of n's bit pattern to bit 23 and above.
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 2));
const int32x4_t ven0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 19);
const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 2));
const int32x4_t ven4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 19);
const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 2));
const int32x4_t ven89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 19);
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask), 2));
const int32x4_t venCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 19);
// Table gather. Each 64-bit lane of vidx packs two 32-bit byte offsets: the low half belongs to the
// even element, the high half to the odd element. vld1_dup_s32 loads the entry at the low offset into
// both lanes; vld1_lane_s32 then overwrites lane 1 with the entry at the high offset.
const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
int32x2_t vl01 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01));
int32x2_t vl23 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23));
vl01 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)), vl01, 1);
vl23 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)), vl23, 1);
const int32x4_t vl0123 = vcombine_s32(vl01, vl23);
const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
int32x2_t vl45 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45));
int32x2_t vl67 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67));
vl45 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)), vl45, 1);
vl67 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)), vl67, 1);
const int32x4_t vl4567 = vcombine_s32(vl45, vl67);
const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
int32x2_t vl89 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx89));
int32x2_t vlAB = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxAB));
vl89 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx89 >> 32)), vl89, 1);
vlAB = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxAB >> 32)), vlAB, 1);
const int32x4_t vl89AB = vcombine_s32(vl89, vlAB);
const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
int32x2_t vlCD = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxCD));
int32x2_t vlEF = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxEF));
vlCD = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxCD >> 32)), vlCD, 1);
vlEF = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxEF >> 32)), vlEF, 1);
const int32x4_t vlCDEF = vcombine_s32(vlCD, vlEF);
// Remove the magic bias from n, and build s by integer-adding the shifted bits of n to the table word
// and reinterpreting the sum as a float.
vn0123 = vsubq_f32(vn0123, vmagic_bias);
float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vl0123, ven0123));
vn4567 = vsubq_f32(vn4567, vmagic_bias);
float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vl4567, ven4567));
vn89AB = vsubq_f32(vn89AB, vmagic_bias);
float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vl89AB, ven89AB));
vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vlCDEF, venCDEF));
// t = z + n * minus_ln2.
float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vminus_ln2);
float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vminus_ln2);
float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vminus_ln2);
float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vminus_ln2);
// p = c2 + c3 * t
float32x4_t vp0123 = vfmaq_f32(vc2, vc3, vt0123);
float32x4_t vp4567 = vfmaq_f32(vc2, vc3, vt4567);
float32x4_t vp89AB = vfmaq_f32(vc2, vc3, vt89AB);
float32x4_t vpCDEF = vfmaq_f32(vc2, vc3, vtCDEF);
// p = p * t
vp0123 = vmulq_f32(vp0123, vt0123);
vp4567 = vmulq_f32(vp4567, vt4567);
vp89AB = vmulq_f32(vp89AB, vt89AB);
vpCDEF = vmulq_f32(vpCDEF, vtCDEF);
// t = t * s, then s = s - 1 (t must be scaled with the original s before s is decremented).
vt0123 = vmulq_f32(vt0123, vs0123);
vs0123 = vsubq_f32(vs0123, vone);
vt4567 = vmulq_f32(vt4567, vs4567);
vs4567 = vsubq_f32(vs4567, vone);
vt89AB = vmulq_f32(vt89AB, vs89AB);
vs89AB = vsubq_f32(vs89AB, vone);
vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
vsCDEF = vsubq_f32(vsCDEF, vone);
// p = t + p * t, using the scaled t from the previous step.
vp0123 = vfmaq_f32(vt0123, vp0123, vt0123);
vp4567 = vfmaq_f32(vt4567, vp4567, vt4567);
vp89AB = vfmaq_f32(vt89AB, vp89AB, vt89AB);
vpCDEF = vfmaq_f32(vtCDEF, vpCDEF, vtCDEF);
// e = (p + (s - 1)) * alpha: the value used for negative inputs.
const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha);
const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha);
const float32x4_t ve89AB = vmulq_f32(vaddq_f32(vp89AB, vs89AB), valpha);
const float32x4_t veCDEF = vmulq_f32(vaddq_f32(vpCDEF, vsCDEF), valpha);
// m = lane mask of (x < 0), taken before x is overwritten with x * beta (the value for the other lanes).
const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
vx0123 = vmulq_f32(vx0123, vbeta);
const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
vx4567 = vmulq_f32(vx4567, vbeta);
const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
vx89AB = vmulq_f32(vx89AB, vbeta);
const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
vxCDEF = vmulq_f32(vxCDEF, vbeta);
// y = m ? e : x * beta (bitwise select per lane).
const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123);
const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567);
const float32x4_t vy89AB = vbslq_f32(vm89AB, ve89AB, vx89AB);
const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF);
vst1q_f32(output, vy0123); output += 4;
vst1q_f32(output, vy4567); output += 4;
vst1q_f32(output, vy89AB); output += 4;
vst1q_f32(output, vyCDEF); output += 4;
}
// Secondary loop: the same computation on one 4-lane vector at a time.
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
float32x4_t vx = vld1q_f32(input); input += 4;
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff);
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e);
// Byte offsets into the table (low 4 bits of n, times 4) and the shifted bits of n.
const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2));
const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19);
// Gather four table entries: low/high 32 bits of each 64-bit lane address the even/odd element.
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi));
vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
vn = vsubq_f32(vn, vmagic_bias);
const int32x4_t vl = vcombine_s32(vl_lo, vl_hi);
// t = z + n * minus_ln2; s = float(bits: table word + shifted n).
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2);
float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven));
// p = (c2 + c3 * t) * t; t = t * s; s = s - 1; p = t + p * t.
float32x4_t vp = vfmaq_f32(vc2, vc3, vt);
vp = vmulq_f32(vp, vt);
vt = vmulq_f32(vt, vs);
vs = vsubq_f32(vs, vone);
vp = vfmaq_f32(vt, vp, vt);
// e = (p + s) * alpha for lanes with x < 0; x * beta for the rest.
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha);
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
vx = vmulq_f32(vx, vbeta);
const float32x4_t vy = vbslq_f32(vm, ve, vx);
vst1q_f32(output, vy); output += 4;
}
// Tail of 1 to 3 floats: a full 4-float vector is still loaded and computed (this reads past the
// remaining input, hence XNN_OOB_READS), but only the valid elements are stored below.
if XNN_UNLIKELY(batch != 0) {
float32x4_t vx = vld1q_f32(input);
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff);
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e);
const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2));
const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19);
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi));
vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1);
vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1);
vn = vsubq_f32(vn, vmagic_bias);
const int32x4_t vl = vcombine_s32(vl_lo, vl_hi);
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2);
float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven));
float32x4_t vp = vfmaq_f32(vc2, vc3, vt);
vp = vmulq_f32(vp, vt);
vt = vmulq_f32(vt, vs);
vs = vsubq_f32(vs, vone);
vp = vfmaq_f32(vt, vp, vt);
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha);
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
vx = vmulq_f32(vx, vbeta);
const float32x4_t vy = vbslq_f32(vm, ve, vx);
// Store two elements if the "2 floats" bit of batch is set, then shift to the upper half so that
// a remaining single element (bit "1 float") is taken from the correct lane.
float32x2_t vy_lo = vget_low_f32(vy);
if (batch & (2 * sizeof(float))) {
vst1_f32(output, vy_lo); output += 2;
vy_lo = vget_high_f32(vy);
}
if (batch & (1 * sizeof(float))) {
vst1_lane_f32(output, vy_lo, 0);
}
}
}
// ELU micro-kernel for float32 using NEON FMA intrinsics; the main loop handles 8 elements per iteration.
//
// For every input element x the kernel stores
//   x * beta                  when the lane test x < 0 is false,
//   alpha * (s * P(t) - 1)    when x < 0,
// where (rounding aside)
//   z    = max(x * prescale, sat_cutoff)
//   n    = magic_bias + z * log2e, with magic_bias subtracted again before use
//   s    = float whose bits are bits(n) << 23 (taken before the bias is removed)
//   t    = z + n * minus_ln2
//   P(t) = 1 + t + c2*t^2 + c3*t^3 + c4*t^4 + c5*t^5 + c6*t^6
// NOTE(review): s * P(t) - 1 presumably approximates exp(z) - 1; the constants come from params and
// are not visible here, so confirm against the params initialization.
//
// Parameters:
//   batch  - size of the input in BYTES; must be non-zero and a multiple of sizeof(float) (asserted).
//   input  - source floats; must not be NULL. The tail path loads a full 4-float vector even when
//            fewer elements remain (the function is annotated XNN_OOB_READS).
//   output - destination floats; must not be NULL. Exactly batch bytes are written.
//   params - constants are read from params->neonfma_rr1_p6.
void xnn_f32_velu_ukernel__neonfma_rr1_p6_x8(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  // Scaling factors applied to the input and to the two output branches.
  const float32x4_t vprescale = vld1q_dup_f32(&params->neonfma_rr1_p6.prescale);
  const float32x4_t valpha = vld1q_dup_f32(&params->neonfma_rr1_p6.alpha);
  const float32x4_t vbeta = vld1q_dup_f32(&params->neonfma_rr1_p6.beta);

  // Constants of the range reduction.
  const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neonfma_rr1_p6.sat_cutoff);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neonfma_rr1_p6.magic_bias);
  const float32x4_t vlog2e = vld1q_dup_f32(&params->neonfma_rr1_p6.log2e);
  const float32x4_t vminus_ln2 = vld1q_dup_f32(&params->neonfma_rr1_p6.minus_ln2);

  // Polynomial coefficients, highest degree first.
  const float32x4_t vc6 = vld1q_dup_f32(&params->neonfma_rr1_p6.c6);
  const float32x4_t vc5 = vld1q_dup_f32(&params->neonfma_rr1_p6.c5);
  const float32x4_t vc4 = vld1q_dup_f32(&params->neonfma_rr1_p6.c4);
  const float32x4_t vc3 = vld1q_dup_f32(&params->neonfma_rr1_p6.c3);
  const float32x4_t vc2 = vld1q_dup_f32(&params->neonfma_rr1_p6.c2);

  const float32x4_t vone = vmovq_n_f32(1.0f);
  const float32x4_t vzero = vmovq_n_f32(0.0f);

  // Main loop: two 4-lane vectors ("lo" and "hi") per iteration, processed in lock-step.
  while (batch >= 8 * sizeof(float)) {
    const float32x4_t vin_lo = vld1q_f32(input);
    const float32x4_t vin_hi = vld1q_f32(input + 4);
    input += 8;

    // z = max(x * prescale, sat_cutoff)
    const float32x4_t vz_lo = vmaxq_f32(vmulq_f32(vin_lo, vprescale), vsat_cutoff);
    const float32x4_t vz_hi = vmaxq_f32(vmulq_f32(vin_hi, vprescale), vsat_cutoff);

    // Biased n = magic_bias + z * log2e.
    const float32x4_t vbiased_lo = vfmaq_f32(vmagic_bias, vz_lo, vlog2e);
    const float32x4_t vbiased_hi = vfmaq_f32(vmagic_bias, vz_hi, vlog2e);

    // s: bit pattern of the biased n shifted left by 23, reinterpreted as float.
    const float32x4_t vscale_lo = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vbiased_lo), 23));
    const float32x4_t vscale_hi = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vbiased_hi), 23));

    // n with the magic bias removed.
    const float32x4_t vn_lo = vsubq_f32(vbiased_lo, vmagic_bias);
    const float32x4_t vn_hi = vsubq_f32(vbiased_hi, vmagic_bias);

    // t = z + n * minus_ln2
    const float32x4_t vt_lo = vfmaq_f32(vz_lo, vn_lo, vminus_ln2);
    const float32x4_t vt_hi = vfmaq_f32(vz_hi, vn_hi, vminus_ln2);

    // Horner evaluation: poly = t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))
    float32x4_t vpoly_lo = vfmaq_f32(vc5, vc6, vt_lo);
    float32x4_t vpoly_hi = vfmaq_f32(vc5, vc6, vt_hi);
    vpoly_lo = vfmaq_f32(vc4, vpoly_lo, vt_lo);
    vpoly_hi = vfmaq_f32(vc4, vpoly_hi, vt_hi);
    vpoly_lo = vfmaq_f32(vc3, vpoly_lo, vt_lo);
    vpoly_hi = vfmaq_f32(vc3, vpoly_hi, vt_hi);
    vpoly_lo = vfmaq_f32(vc2, vpoly_lo, vt_lo);
    vpoly_hi = vfmaq_f32(vc2, vpoly_hi, vt_hi);
    vpoly_lo = vmulq_f32(vpoly_lo, vt_lo);
    vpoly_hi = vmulq_f32(vpoly_hi, vt_hi);

    // ts = t * s and sm1 = s - 1, then poly = ts + poly * ts.
    const float32x4_t vts_lo = vmulq_f32(vt_lo, vscale_lo);
    const float32x4_t vts_hi = vmulq_f32(vt_hi, vscale_hi);
    const float32x4_t vsm1_lo = vsubq_f32(vscale_lo, vone);
    const float32x4_t vsm1_hi = vsubq_f32(vscale_hi, vone);
    vpoly_lo = vfmaq_f32(vts_lo, vpoly_lo, vts_lo);
    vpoly_hi = vfmaq_f32(vts_hi, vpoly_hi, vts_hi);

    // Negative branch: (poly + (s - 1)) * alpha.  Other branch: x * beta.
    const float32x4_t vneg_lo = vmulq_f32(vaddq_f32(vpoly_lo, vsm1_lo), valpha);
    const float32x4_t vneg_hi = vmulq_f32(vaddq_f32(vpoly_hi, vsm1_hi), valpha);
    const float32x4_t vpos_lo = vmulq_f32(vin_lo, vbeta);
    const float32x4_t vpos_hi = vmulq_f32(vin_hi, vbeta);

    // Per-lane select on the sign test x < 0.
    const uint32x4_t vis_neg_lo = vcltq_f32(vin_lo, vzero);
    const uint32x4_t vis_neg_hi = vcltq_f32(vin_hi, vzero);
    const float32x4_t vout_lo = vbslq_f32(vis_neg_lo, vneg_lo, vpos_lo);
    const float32x4_t vout_hi = vbslq_f32(vis_neg_hi, vneg_hi, vpos_hi);

    vst1q_f32(output, vout_lo);
    vst1q_f32(output + 4, vout_hi);
    output += 8;

    batch -= 8 * sizeof(float);
  }

  // Secondary loop: the same computation on a single 4-lane vector.
  while (batch >= 4 * sizeof(float)) {
    const float32x4_t vin = vld1q_f32(input);
    input += 4;

    const float32x4_t vz = vmaxq_f32(vmulq_f32(vin, vprescale), vsat_cutoff);
    const float32x4_t vbiased = vfmaq_f32(vmagic_bias, vz, vlog2e);
    const float32x4_t vscale = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vbiased), 23));
    const float32x4_t vn = vsubq_f32(vbiased, vmagic_bias);
    const float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2);

    float32x4_t vpoly = vfmaq_f32(vc5, vc6, vt);
    vpoly = vfmaq_f32(vc4, vpoly, vt);
    vpoly = vfmaq_f32(vc3, vpoly, vt);
    vpoly = vfmaq_f32(vc2, vpoly, vt);
    vpoly = vmulq_f32(vpoly, vt);

    const float32x4_t vts = vmulq_f32(vt, vscale);
    const float32x4_t vsm1 = vsubq_f32(vscale, vone);
    vpoly = vfmaq_f32(vts, vpoly, vts);

    const float32x4_t vneg = vmulq_f32(vaddq_f32(vpoly, vsm1), valpha);
    const uint32x4_t vis_neg = vcltq_f32(vin, vzero);
    const float32x4_t vpos = vmulq_f32(vin, vbeta);

    vst1q_f32(output, vbslq_f32(vis_neg, vneg, vpos));
    output += 4;

    batch -= 4 * sizeof(float);
  }

  // Tail of 1 to 3 floats: compute on a full vector (the load reads past the remaining input,
  // hence XNN_OOB_READS) and store only the valid elements.
  if XNN_UNLIKELY(batch != 0) {
    const float32x4_t vin = vld1q_f32(input);

    const float32x4_t vz = vmaxq_f32(vmulq_f32(vin, vprescale), vsat_cutoff);
    const float32x4_t vbiased = vfmaq_f32(vmagic_bias, vz, vlog2e);
    const float32x4_t vscale = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vbiased), 23));
    const float32x4_t vn = vsubq_f32(vbiased, vmagic_bias);
    const float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2);

    float32x4_t vpoly = vfmaq_f32(vc5, vc6, vt);
    vpoly = vfmaq_f32(vc4, vpoly, vt);
    vpoly = vfmaq_f32(vc3, vpoly, vt);
    vpoly = vfmaq_f32(vc2, vpoly, vt);
    vpoly = vmulq_f32(vpoly, vt);

    const float32x4_t vts = vmulq_f32(vt, vscale);
    const float32x4_t vsm1 = vsubq_f32(vscale, vone);
    vpoly = vfmaq_f32(vts, vpoly, vts);

    const float32x4_t vneg = vmulq_f32(vaddq_f32(vpoly, vsm1), valpha);
    const uint32x4_t vis_neg = vcltq_f32(vin, vzero);
    const float32x4_t vpos = vmulq_f32(vin, vbeta);
    const float32x4_t vout = vbslq_f32(vis_neg, vneg, vpos);

    // Two elements first (if that bit of batch is set), then at most one more from the next lane.
    float32x2_t vpair = vget_low_f32(vout);
    if (batch & (2 * sizeof(float))) {
      vst1_f32(output, vpair);
      output += 2;
      vpair = vget_high_f32(vout);
    }
    if (batch & (1 * sizeof(float))) {
      vst1_lane_f32(output, vpair, 0);
    }
  }
}
// Per-channel multiply-add with clamping for float32, two rows per pass, four channels per vector.
//
// For each row and channel c this stores
//   min(max(bias[c] + scale[c] * input[c], params->scalar.min), params->scalar.max).
//
// Parameters:
//   rows          - number of rows to process; must be non-zero (asserted).
//   channels      - row length in BYTES; must be non-zero and a multiple of sizeof(float) (asserted).
//   input         - first input row; subsequent rows are input_stride bytes apart.
//   input_stride  - distance between consecutive input rows, in bytes.
//   weights       - packed per-channel constants: for every group of 4 channels, 4 scale values
//                   followed by 4 bias values. The same weights are reused for every row.
//   output        - first output row; subsequent rows are output_stride bytes apart.
//   output_stride - distance between consecutive output rows, in bytes.
//   params        - clamping bounds, read from params->scalar.min and params->scalar.max.
//
// When the channel count is not a multiple of 4, the tail still loads full 4-float vectors from
// the inputs and weights (the function is annotated XNN_OOB_READS) but stores only valid channels.
void xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x(
    size_t rows,
    size_t channels,
    const float* restrict input,
    size_t input_stride,
    const float* restrict weights,
    float* restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows != 0);
  assert(channels != 0);
  assert(channels % sizeof(float) == 0);

  const float32x4_t vlower = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vupper = vld1q_dup_f32(&params->scalar.max);

  // After a row pair has been walked for `channels` bytes, these advances move each pointer
  // to the start of the row two strides further on.
  const size_t in_advance = input_stride * 2 - channels;
  const size_t out_advance = output_stride * 2 - channels;

  const float* src0 = input;
  const float* src1 = (const float*) ((uintptr_t) src0 + input_stride);
  float* dst0 = output;
  float* dst1 = (float*) ((uintptr_t) dst0 + output_stride);

  do {
    // With a single row left, alias the second row onto the first so that the pair-wise code
    // below just processes (and stores) the same row twice.
    if XNN_UNPREDICTABLE(rows < 2) {
      src1 = src0;
      dst1 = dst0;
    }

    const float* wptr = weights;
    size_t remaining = channels;

    // Full groups of 4 channels.
    while (remaining >= 4 * sizeof(float)) {
      const float32x4_t vscale = vld1q_f32(wptr);
      const float32x4_t vbias = vld1q_f32(wptr + 4);
      wptr += 8;

      float32x4_t vrow0 = vld1q_f32(src0); src0 += 4;
      float32x4_t vrow1 = vld1q_f32(src1); src1 += 4;

      // bias + scale * x, then clamp from below and from above (in that order).
      vrow0 = vfmaq_f32(vbias, vscale, vrow0);
      vrow1 = vfmaq_f32(vbias, vscale, vrow1);
      vrow0 = vminq_f32(vmaxq_f32(vrow0, vlower), vupper);
      vrow1 = vminq_f32(vmaxq_f32(vrow1, vlower), vupper);

      vst1q_f32(dst0, vrow0); dst0 += 4;
      vst1q_f32(dst1, vrow1); dst1 += 4;

      remaining -= 4 * sizeof(float);
    }

    // Tail of 1 to 3 channels: compute on full vectors, store only the valid lanes.
    if XNN_UNLIKELY(remaining != 0) {
      const float32x4_t vscale = vld1q_f32(wptr);
      const float32x4_t vbias = vld1q_f32(wptr + 4);

      float32x4_t vrow0 = vld1q_f32(src0);
      float32x4_t vrow1 = vld1q_f32(src1);
      // Input pointers advance by the remaining byte count only, not by a full vector.
      src0 = (const float*) ((uintptr_t) src0 + remaining);
      src1 = (const float*) ((uintptr_t) src1 + remaining);

      vrow0 = vfmaq_f32(vbias, vscale, vrow0);
      vrow1 = vfmaq_f32(vbias, vscale, vrow1);
      vrow0 = vminq_f32(vmaxq_f32(vrow0, vlower), vupper);
      vrow1 = vminq_f32(vmaxq_f32(vrow1, vlower), vupper);

      float32x2_t vhalf0 = vget_low_f32(vrow0);
      float32x2_t vhalf1 = vget_low_f32(vrow1);
      if (remaining & (2 * sizeof(float))) {
        vst1_f32(dst0, vhalf0); dst0 += 2;
        vst1_f32(dst1, vhalf1); dst1 += 2;
        vhalf0 = vget_high_f32(vrow0);
        vhalf1 = vget_high_f32(vrow1);
      }
      if (remaining & (1 * sizeof(float))) {
        vst1_lane_f32(dst0, vhalf0, 0); dst0 += 1;
        vst1_lane_f32(dst1, vhalf1, 0); dst1 += 1;
      }
    }

    // Step every pointer to the next pair of rows.
    src0 = (const float*) ((uintptr_t) src0 + in_advance);
    src1 = (const float*) ((uintptr_t) src1 + in_advance);
    dst0 = (float*) ((uintptr_t) dst0 + out_advance);
    dst1 = (float*) ((uintptr_t) dst1 + out_advance);

    // NOTE(review): doz is defined elsewhere; presumably "difference or zero" (saturating
    // subtraction), so an odd final row terminates the loop instead of wrapping around.
    rows = doz(rows, 2);
  } while (rows != 0);
}
extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64];
// Sigmoid micro-kernel for float32 using NEON FMA intrinsics; the main loop handles 16 elements per iteration.
//
// Per element x, with z = |x| (rounding aside):
//   n = magic_bias + z * minus_log2e
//   s = float whose bits are bits(table[bits(n) & 0x3F]) + (bits(n) << 17)
//   n = n - magic_bias;  t = z + n * ln2
//   y = s * (1 - t + c2 * t^2)          (computed as s - s * (t - (c2 * t) * t))
//   f = y / (y + 1), where 1 / (y + 1) comes from vrecpeq_f32 refined by two vrecpsq_f32 steps
//   f = 0 where |x| > |denorm_cutoff|
//   result = f when x < 0, otherwise 1 - f
// NOTE(review): y presumably approximates exp(-z); the constants come from params and the table
// contents are defined elsewhere, so this cannot be confirmed from this function alone.
//
// Parameters:
//   batch  - size of the input in BYTES; must be non-zero and a multiple of sizeof(float) (asserted).
//   input  - source floats; must not be NULL. The tail path loads a full 4-float vector even when
//            fewer elements remain (the function is annotated XNN_OOB_READS).
//   output - destination floats; must not be NULL. Exactly batch bytes are written.
//   params - constants are read from params->neonfma_rr1_lut64_p2.
void xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16(
size_t batch,
const float* input,
float* output,
const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(float) == 0);
assert(input != NULL);
assert(output != NULL);
// Broadcast every scalar constant to all four lanes once, outside the loops.
const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.magic_bias);
const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.minus_log2e);
// Mask selecting the low 6 bits of n's bit pattern: the index into the 64-entry table.
const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F));
const float32x4_t vln2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.ln2);
const float32x4_t vc2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.c2);
const float32x4_t vone = vmovq_n_f32(1.0f);
const float32x4_t vdenorm_cutoff = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.denorm_cutoff);
// Main loop: 16 floats (four 4-lane vectors) per iteration; the four vectors are processed in lock-step.
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
const float32x4_t vx0123 = vld1q_f32(input); input += 4;
const float32x4_t vx4567 = vld1q_f32(input); input += 4;
const float32x4_t vx89AB = vld1q_f32(input); input += 4;
const float32x4_t vxCDEF = vld1q_f32(input); input += 4;
// z = |x|; the sign of x is only used at the very end to pick f or 1 - f.
const float32x4_t vz0123 = vabsq_f32(vx0123);
const float32x4_t vz4567 = vabsq_f32(vx4567);
const float32x4_t vz89AB = vabsq_f32(vx89AB);
const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
// n = magic_bias + z * minus_log2e (fused multiply-add).
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
// e = bits(n) << 17, which moves bit 6 and above of n's bit pattern to bit 23 and above.
const int32x4_t ve0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 17);
const int32x4_t ve4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 17);
const int32x4_t ve89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 17);
const int32x4_t veCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 17);
// Use the low 6 bits of n's bit pattern, as an integer, as the index for the table lookup of l
// (an entry of the 64-element table xnn_table_exp2minus_k_over_64).
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
// Table gather, first half. Each 64-bit lane of vidx packs two 32-bit element indices: the low half
// belongs to the even element and is loaded (duplicated into both lanes) here.
const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx01]);
float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx23]);
const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx45]);
float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx67]);
const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx89]);
float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxAB]);
const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxCD]);
float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxEF]);
// Table gather, second half: the high 32 bits of each 64-bit index pair address the odd element,
// which is loaded into lane 1; the pairs are then combined into 4-lane vectors.
vl01 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1);
vl23 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1);
const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
vl45 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1);
vl67 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1);
const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
vl89 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1);
vlAB = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1);
const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
vlCD = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1);
vlEF = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1);
const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
// s = float(bits(l) + e): the shifted bits of n are integer-added to the table value's bit pattern.
const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
// Remove the magic bias from n.
vn0123 = vsubq_f32(vn0123, vmagic_bias);
vn4567 = vsubq_f32(vn4567, vmagic_bias);
vn89AB = vsubq_f32(vn89AB, vmagic_bias);
vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
// t = z + n * ln2.
float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2);
float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2);
float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2);
float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2);
// p = t * c2
float32x4_t vp0123 = vmulq_f32(vt0123, vc2);
float32x4_t vp4567 = vmulq_f32(vt4567, vc2);
float32x4_t vp89AB = vmulq_f32(vt89AB, vc2);
float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2);
// p = t - p * t   (vfmsq_f32(a, b, c) computes a - b * c)
vp0123 = vfmsq_f32(vt0123, vp0123, vt0123);
vp4567 = vfmsq_f32(vt4567, vp4567, vt4567);
vp89AB = vfmsq_f32(vt89AB, vp89AB, vt89AB);
vpCDEF = vfmsq_f32(vtCDEF, vpCDEF, vtCDEF);
// y = s - s * p
const float32x4_t vy0123 = vfmsq_f32(vs0123, vs0123, vp0123);
const float32x4_t vy4567 = vfmsq_f32(vs4567, vs4567, vp4567);
const float32x4_t vy89AB = vfmsq_f32(vs89AB, vs89AB, vp89AB);
const float32x4_t vyCDEF = vfmsq_f32(vsCDEF, vsCDEF, vpCDEF);
// d = y + 1: the denominator of y / (y + 1).
const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
// r ~= 1 / d: initial reciprocal estimate ...
float32x4_t vr0123 = vrecpeq_f32(vd0123);
float32x4_t vr4567 = vrecpeq_f32(vd4567);
float32x4_t vr89AB = vrecpeq_f32(vd89AB);
float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
// ... refined by two reciprocal-step iterations, r = r * vrecps(r, d).
vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
// f = y * r  (i.e. y / (y + 1))
float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
// Force f to +0.0 in lanes where |x| > |denorm_cutoff| (absolute compare, then bit-clear with the mask).
vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
// m = lane mask of (x < 0).
const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
// Result: f for negative x, 1 - f otherwise.
vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
vst1q_f32(output, vf0123); output += 4;
vst1q_f32(output, vf4567); output += 4;
vst1q_f32(output, vf89AB); output += 4;
vst1q_f32(output, vfCDEF); output += 4;
}
// Secondary loop: the same computation on one 4-lane vector at a time.
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
const float32x4_t vx = vld1q_f32(input); input += 4;
const float32x4_t vz = vabsq_f32(vx);
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
// Shifted bits of n, and the low 6 bits of n as table indices.
const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17);
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
// Gather four table entries: low/high 32 bits of each 64-bit lane index the even/odd element.
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]);
vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
// s = float(bits(l) + e); n loses its magic bias; t = z + n * ln2.
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
vn = vsubq_f32(vn, vmagic_bias);
float32x4_t vt = vfmaq_f32(vz, vn, vln2);
// p = t - (t * c2) * t; y = s - s * p.
float32x4_t vp = vmulq_f32(vt, vc2);
vp = vfmsq_f32(vt, vp, vt);
const float32x4_t vy = vfmsq_f32(vs, vs, vp);
// f = y / (y + 1) via reciprocal estimate plus two refinement steps.
const float32x4_t vd = vaddq_f32(vy, vone);
float32x4_t vr = vrecpeq_f32(vd);
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
float32x4_t vf = vmulq_f32(vy, vr);
// Zero f where |x| > |denorm_cutoff|, then pick f (x < 0) or 1 - f.
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
vst1q_f32(output, vf); output += 4;
}
// Tail of 1 to 3 floats: a full 4-float vector is still loaded and computed (this reads past the
// remaining input, hence XNN_OOB_READS), but only the valid elements are stored below.
if XNN_UNLIKELY(batch != 0) {
const float32x4_t vx = vld1q_f32(input);
const float32x4_t vz = vabsq_f32(vx);
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17);
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]);
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]);
vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
vn = vsubq_f32(vn, vmagic_bias);
float32x4_t vt = vfmaq_f32(vz, vn, vln2);
float32x4_t vp = vmulq_f32(vt, vc2);
vp = vfmsq_f32(vt, vp, vt);
const float32x4_t vy = vfmsq_f32(vs, vs, vp);
const float32x4_t vd = vaddq_f32(vy, vone);
float32x4_t vr = vrecpeq_f32(vd);
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
float32x4_t vf = vmulq_f32(vy, vr);
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
// Store two elements if the "2 floats" bit of batch is set, then shift to the upper half so that
// a remaining single element (bit "1 float") is taken from the correct lane.
float32x2_t vf_lo = vget_low_f32(vf);
if (batch & (2 * sizeof(float))) {
vst1_f32(output, vf_lo); output += 2;
vf_lo = vget_high_f32(vf);
}
if (batch & (1 * sizeof(float))) {
vst1_lane_f32(output, vf_lo, 0);
}
}
}
// Element-wise f32 tanh microkernel (per its name) using NEON with fused
// multiply-add, processing 8 floats per main-loop iteration.
//
//   batch  - size of the input in BYTES; must be a non-zero multiple of
//            sizeof(float).
//   input  - source floats. The remainder path loads a full 4-lane vector even
//            when only 1-3 elements are left, hence XNN_OOB_READS.
//   output - destination floats; exactly batch / sizeof(float) elements are
//            written.
//   params - constants (saturation cutoff, scaled log2(e), magic bias, ln2 and
//            polynomial coefficients c2..c6) read from the
//            neon_expm1minus_rr1_p6h5 member.
//
// Per lane the kernel evaluates, with z = min(|x|, sat_cutoff):
//   n   = magic_bias + z * minus_log2e
//   s   = float whose bits are the bits of n shifted left by 23
//   n  -= magic_bias
//   t   = z + n * ln2
//   p   = 2 - t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))
//   emo = (s - 1) - p * (t * s)
//   epo = emo + 2
//   y   = emo * (1 / epo), reciprocal from an estimate plus two FMA refinements
// and finally replaces the sign bit of y with the sign bit of x.
// NOTE(review): emo/epo presumably stand for "exp minus one"/"exp plus one" of
// a scaled -z; the exact scaling lives in the params initializer, which is not
// visible here - confirm there.
void xnn_f32_vtanh_ukernel__neonfma_expm1minus_rr1_p6h5ts_nr2fma_x8(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  // Preconditions, checked the same way other kernels in this file check
  // theirs. The remainder path below only inspects the 2-float and 1-float
  // bits of the byte count, so a batch that is not a whole number of floats
  // would be silently mishandled without these checks.
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  // Broadcast every scalar constant across all four lanes once, up front.
  const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.sat_cutoff);
  const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.minus_log2e);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.magic_bias);
  const float32x4_t vln2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.ln2);
  const float32x4_t vc6 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c6);
  const float32x4_t vc5 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c5);
  const float32x4_t vc4 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c4);
  const float32x4_t vc3 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c3);
  const float32x4_t vc2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c2);
  const float32x4_t vone = vmovq_n_f32(1.0f);
  const float32x4_t vtwo = vmovq_n_f32(2.0f);
  // Selects only the IEEE-754 sign bit in the final bit-select.
  const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000));

  // Main loop: two independent 4-lane vectors (8 floats) per iteration.
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const float32x4_t vx0123 = vld1q_f32(input); input += 4;
    const float32x4_t vx4567 = vld1q_f32(input); input += 4;

    // z = min(|x|, sat_cutoff): work on the magnitude, clamped from above.
    float32x4_t vz0123 = vabsq_f32(vx0123);
    float32x4_t vz4567 = vabsq_f32(vx4567);
    vz0123 = vminq_f32(vz0123, vsat_cutoff);
    vz4567 = vminq_f32(vz4567, vsat_cutoff);

    // n = magic_bias + z * minus_log2e.
    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);

    // s: the low bits of n moved up into the sign/exponent field of a float.
    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));

    // Remove the bias again, then form the reduced argument t = z + n * ln2.
    vn0123 = vsubq_f32(vn0123, vmagic_bias);
    vn4567 = vsubq_f32(vn4567, vmagic_bias);
    const float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2);
    const float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2);

    // Horner evaluation: p = c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))).
    float32x4_t vp0123 = vfmaq_f32(vc5, vc6, vt0123);
    float32x4_t vp4567 = vfmaq_f32(vc5, vc6, vt4567);
    vp0123 = vfmaq_f32(vc4, vp0123, vt0123);
    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
    vp4567 = vfmaq_f32(vc4, vp4567, vt4567);
    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
    // p = 2 - p * t.
    vp0123 = vfmsq_f32(vtwo, vp0123, vt0123);
    vp4567 = vfmsq_f32(vtwo, vp4567, vt4567);

    // emo = (s - 1) - p * (t * s).
    const float32x4_t vts0123 = vmulq_f32(vt0123, vs0123);
    const float32x4_t vsmo0123 = vsubq_f32(vs0123, vone);
    const float32x4_t vts4567 = vmulq_f32(vt4567, vs4567);
    const float32x4_t vsmo4567 = vsubq_f32(vs4567, vone);
    const float32x4_t vemo0123 = vfmsq_f32(vsmo0123, vp0123, vts0123);
    const float32x4_t vemo4567 = vfmsq_f32(vsmo4567, vp4567, vts4567);

    // epo = emo + 2 is the denominator of the final quotient.
    const float32x4_t vepo0123 = vaddq_f32(vemo0123, vtwo);
    const float32x4_t vepo4567 = vaddq_f32(vemo4567, vtwo);

    // Reciprocal of epo: hardware estimate refined twice with
    // r += r * (1 - r * epo), each step done with fused multiply-add/subtract.
    float32x4_t vrepo0123 = vrecpeq_f32(vepo0123);
    float32x4_t vrepo4567 = vrecpeq_f32(vepo4567);
    float32x4_t verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123);
    float32x4_t verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567);
    vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123);
    vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567);
    verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123);
    verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567);
    vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123);
    vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567);

    // y = emo / epo.
    float32x4_t vy0123 = vmulq_f32(vemo0123, vrepo0123);
    float32x4_t vy4567 = vmulq_f32(vemo4567, vrepo4567);

    // Take the sign bit from x and every other bit from y.
    vy0123 = vbslq_f32(vsign_mask, vx0123, vy0123);
    vy4567 = vbslq_f32(vsign_mask, vx4567, vy4567);

    vst1q_f32(output, vy0123); output += 4;
    vst1q_f32(output, vy4567); output += 4;
  }

  // Same computation for one remaining full 4-lane vector.
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const float32x4_t vx = vld1q_f32(input); input += 4;

    float32x4_t vz = vabsq_f32(vx);
    vz = vminq_f32(vz, vsat_cutoff);

    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
    vn = vsubq_f32(vn, vmagic_bias);
    const float32x4_t vt = vfmaq_f32(vz, vn, vln2);

    float32x4_t vp = vfmaq_f32(vc5, vc6, vt);
    vp = vfmaq_f32(vc4, vp, vt);
    vp = vfmaq_f32(vc3, vp, vt);
    vp = vfmaq_f32(vc2, vp, vt);
    vp = vfmsq_f32(vtwo, vp, vt);

    const float32x4_t vts = vmulq_f32(vt, vs);
    const float32x4_t vsmo = vsubq_f32(vs, vone);
    const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts);
    const float32x4_t vepo = vaddq_f32(vemo, vtwo);

    float32x4_t vrepo = vrecpeq_f32(vepo);
    float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo);
    vrepo = vfmaq_f32(vrepo, vrepo, verepo);
    verepo = vfmsq_f32(vone, vrepo, vepo);
    vrepo = vfmaq_f32(vrepo, vrepo, verepo);

    float32x4_t vy = vmulq_f32(vemo, vrepo);
    vy = vbslq_f32(vsign_mask, vx, vy);

    vst1q_f32(output, vy); output += 4;
  }

  // Remainder of 1-3 floats: compute a whole vector, store only the valid lanes.
  if XNN_UNLIKELY(batch != 0) {
    // Full-width load; may read past the end of the input (see XNN_OOB_READS).
    const float32x4_t vx = vld1q_f32(input);

    float32x4_t vz = vabsq_f32(vx);
    vz = vminq_f32(vz, vsat_cutoff);

    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
    vn = vsubq_f32(vn, vmagic_bias);
    const float32x4_t vt = vfmaq_f32(vz, vn, vln2);

    float32x4_t vp = vfmaq_f32(vc5, vc6, vt);
    vp = vfmaq_f32(vc4, vp, vt);
    vp = vfmaq_f32(vc3, vp, vt);
    vp = vfmaq_f32(vc2, vp, vt);
    vp = vfmsq_f32(vtwo, vp, vt);

    const float32x4_t vts = vmulq_f32(vt, vs);
    const float32x4_t vsmo = vsubq_f32(vs, vone);
    const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts);
    const float32x4_t vepo = vaddq_f32(vemo, vtwo);

    float32x4_t vrepo = vrecpeq_f32(vepo);
    float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo);
    vrepo = vfmaq_f32(vrepo, vrepo, verepo);
    verepo = vfmsq_f32(vone, vrepo, vepo);
    vrepo = vfmaq_f32(vrepo, vrepo, verepo);

    float32x4_t vy = vmulq_f32(vemo, vrepo);
    vy = vbslq_f32(vsign_mask, vx, vy);

    float32x2_t vy_low = vget_low_f32(vy);
    if (batch & (2 * sizeof(float))) {
      // Two or three floats left: store lanes 0-1, then move on to lanes 2-3.
      vst1_f32(output, vy_low); output += 2;
      vy_low = vget_high_f32(vy);
    }
    if (batch & (1 * sizeof(float))) {
      // Odd count: store the single next lane.
      vst1_lane_f32(output, vy_low, 0);
    }
  }
}