#include <assert.h> |
|
#include <math.h> |
|
#include <stddef.h> |
|
#include <stdint.h> |
|
|
|
#include <arm_neon.h> |
|
|
|
#include <xnnpack/common.h> |
|
#include <xnnpack/dwconv.h> |
|
#include <xnnpack/gemm.h> |
|
#include <xnnpack/ibilinear.h> |
|
#include <xnnpack/igemm.h> |
|
#include <xnnpack/math-stubs.h> |
|
#include <xnnpack/math.h> |
|
#include <xnnpack/microparams.h> |
|
#include <xnnpack/prefetch.h> |
|
#include <xnnpack/raddstoreexpminusmax.h> |
|
#include <xnnpack/spmm.h> |
|
#include <xnnpack/vmulcaddc.h> |
|
#include <xnnpack/vunary.h> |
|
|
|
|
|
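// Depthwise convolution, 25-tap unipass microkernel: NEON FMA, 8 channels per
// main-loop iteration, two accumulator chains ("acc2") so consecutive FMAs
// target independent registers and hide FMA latency.
//
// Shared parameter conventions for the dwconv microkernels in this file:
// - input: indirection buffer with one row pointer per tap for each output
//   pixel, advanced by input_stride bytes per pixel
// - weights: packed per 8-channel group as 8 biases followed by 8 weights for
//   each tap
// - input_offset: byte offset applied to every pointer that is not `zero`
// - zero: shared zero buffer substituted for padding taps
// - output_increment: byte adjustment applied to `output` after each pixel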
void xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2( |
|
size_t channels, |
|
size_t output_width, |
|
const float** input, |
|
const float* weights, |
|
float* output, |
|
intptr_t input_stride, |
|
size_t output_increment, |
|
size_t input_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(channels != 0); |
|
assert(output_width != 0); |
|
|
|
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
do { |
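// Gather the 25 per-tap row pointers for this output pixel; pointers into the
// shared zero buffer are used as-is, all others are shifted by input_offset.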
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
const float* i4 = input[4]; |
|
assert(i4 != NULL); |
|
if XNN_UNPREDICTABLE(i4 != zero) { |
|
i4 = (const float*) ((uintptr_t) i4 + input_offset); |
|
} |
|
const float* i5 = input[5]; |
|
assert(i5 != NULL); |
|
if XNN_UNPREDICTABLE(i5 != zero) { |
|
i5 = (const float*) ((uintptr_t) i5 + input_offset); |
|
} |
|
const float* i6 = input[6]; |
|
assert(i6 != NULL); |
|
if XNN_UNPREDICTABLE(i6 != zero) { |
|
i6 = (const float*) ((uintptr_t) i6 + input_offset); |
|
} |
|
const float* i7 = input[7]; |
|
assert(i7 != NULL); |
|
if XNN_UNPREDICTABLE(i7 != zero) { |
|
i7 = (const float*) ((uintptr_t) i7 + input_offset); |
|
} |
|
const float* i8 = input[8]; |
|
assert(i8 != NULL); |
|
if XNN_UNPREDICTABLE(i8 != zero) { |
|
i8 = (const float*) ((uintptr_t) i8 + input_offset); |
|
} |
|
const float* i9 = input[9]; |
|
assert(i9 != NULL); |
|
if XNN_UNPREDICTABLE(i9 != zero) { |
|
i9 = (const float*) ((uintptr_t) i9 + input_offset); |
|
} |
|
const float* i10 = input[10]; |
|
assert(i10 != NULL); |
|
if XNN_UNPREDICTABLE(i10 != zero) { |
|
i10 = (const float*) ((uintptr_t) i10 + input_offset); |
|
} |
|
const float* i11 = input[11]; |
|
assert(i11 != NULL); |
|
if XNN_UNPREDICTABLE(i11 != zero) { |
|
i11 = (const float*) ((uintptr_t) i11 + input_offset); |
|
} |
|
const float* i12 = input[12]; |
|
assert(i12 != NULL); |
|
if XNN_UNPREDICTABLE(i12 != zero) { |
|
i12 = (const float*) ((uintptr_t) i12 + input_offset); |
|
} |
|
const float* i13 = input[13]; |
|
assert(i13 != NULL); |
|
if XNN_UNPREDICTABLE(i13 != zero) { |
|
i13 = (const float*) ((uintptr_t) i13 + input_offset); |
|
} |
|
const float* i14 = input[14]; |
|
assert(i14 != NULL); |
|
if XNN_UNPREDICTABLE(i14 != zero) { |
|
i14 = (const float*) ((uintptr_t) i14 + input_offset); |
|
} |
|
const float* i15 = input[15]; |
|
assert(i15 != NULL); |
|
if XNN_UNPREDICTABLE(i15 != zero) { |
|
i15 = (const float*) ((uintptr_t) i15 + input_offset); |
|
} |
|
const float* i16 = input[16]; |
|
assert(i16 != NULL); |
|
if XNN_UNPREDICTABLE(i16 != zero) { |
|
i16 = (const float*) ((uintptr_t) i16 + input_offset); |
|
} |
|
const float* i17 = input[17]; |
|
assert(i17 != NULL); |
|
if XNN_UNPREDICTABLE(i17 != zero) { |
|
i17 = (const float*) ((uintptr_t) i17 + input_offset); |
|
} |
|
const float* i18 = input[18]; |
|
assert(i18 != NULL); |
|
if XNN_UNPREDICTABLE(i18 != zero) { |
|
i18 = (const float*) ((uintptr_t) i18 + input_offset); |
|
} |
|
const float* i19 = input[19]; |
|
assert(i19 != NULL); |
|
if XNN_UNPREDICTABLE(i19 != zero) { |
|
i19 = (const float*) ((uintptr_t) i19 + input_offset); |
|
} |
|
const float* i20 = input[20]; |
|
assert(i20 != NULL); |
|
if XNN_UNPREDICTABLE(i20 != zero) { |
|
i20 = (const float*) ((uintptr_t) i20 + input_offset); |
|
} |
|
const float* i21 = input[21]; |
|
assert(i21 != NULL); |
|
if XNN_UNPREDICTABLE(i21 != zero) { |
|
i21 = (const float*) ((uintptr_t) i21 + input_offset); |
|
} |
|
const float* i22 = input[22]; |
|
assert(i22 != NULL); |
|
if XNN_UNPREDICTABLE(i22 != zero) { |
|
i22 = (const float*) ((uintptr_t) i22 + input_offset); |
|
} |
|
const float* i23 = input[23]; |
|
assert(i23 != NULL); |
|
if XNN_UNPREDICTABLE(i23 != zero) { |
|
i23 = (const float*) ((uintptr_t) i23 + input_offset); |
|
} |
|
const float* i24 = input[24]; |
|
assert(i24 != NULL); |
|
if XNN_UNPREDICTABLE(i24 != zero) { |
|
i24 = (const float*) ((uintptr_t) i24 + input_offset); |
|
} |
|
|
|
input = (const float**) ((uintptr_t) input + input_stride); |
|
|
|
size_t c = channels; |
|
const float* w = weights; |
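// Main loop: 8 channels at a time. Even taps accumulate into vacc*p0, odd
// taps into vacc*p1; the two chains are summed once before clamping.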
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi5x4567, vk5x4567); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi7x4567, vk7x4567); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567); |
|
|
|
const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; |
|
const float32x4_t vi9x4567 = vld1q_f32(i9); i9 += 4; |
|
const float32x4_t vk9x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk9x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi9x4567, vk9x4567); |
|
|
|
const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; |
|
const float32x4_t vi10x4567 = vld1q_f32(i10); i10 += 4; |
|
const float32x4_t vk10x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk10x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi10x4567, vk10x4567); |
|
|
|
const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; |
|
const float32x4_t vi11x4567 = vld1q_f32(i11); i11 += 4; |
|
const float32x4_t vk11x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk11x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi11x4567, vk11x4567); |
|
|
|
const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; |
|
const float32x4_t vi12x4567 = vld1q_f32(i12); i12 += 4; |
|
const float32x4_t vk12x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk12x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi12x4567, vk12x4567); |
|
|
|
const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; |
|
const float32x4_t vi13x4567 = vld1q_f32(i13); i13 += 4; |
|
const float32x4_t vk13x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk13x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi13x4567, vk13x4567); |
|
|
|
const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; |
|
const float32x4_t vi14x4567 = vld1q_f32(i14); i14 += 4; |
|
const float32x4_t vk14x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk14x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi14x4567, vk14x4567); |
|
|
|
const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; |
|
const float32x4_t vi15x4567 = vld1q_f32(i15); i15 += 4; |
|
const float32x4_t vk15x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk15x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi15x4567, vk15x4567); |
|
|
|
const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; |
|
const float32x4_t vi16x4567 = vld1q_f32(i16); i16 += 4; |
|
const float32x4_t vk16x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk16x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi16x4567, vk16x4567); |
|
|
|
const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; |
|
const float32x4_t vi17x4567 = vld1q_f32(i17); i17 += 4; |
|
const float32x4_t vk17x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk17x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi17x4567, vk17x4567); |
|
|
|
const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; |
|
const float32x4_t vi18x4567 = vld1q_f32(i18); i18 += 4; |
|
const float32x4_t vk18x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk18x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi18x4567, vk18x4567); |
|
|
|
const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; |
|
const float32x4_t vi19x4567 = vld1q_f32(i19); i19 += 4; |
|
const float32x4_t vk19x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk19x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi19x4567, vk19x4567); |
|
|
|
const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; |
|
const float32x4_t vi20x4567 = vld1q_f32(i20); i20 += 4; |
|
const float32x4_t vk20x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk20x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi20x4567, vk20x4567); |
|
|
|
const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; |
|
const float32x4_t vi21x4567 = vld1q_f32(i21); i21 += 4; |
|
const float32x4_t vk21x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk21x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi21x4567, vk21x4567); |
|
|
|
const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; |
|
const float32x4_t vi22x4567 = vld1q_f32(i22); i22 += 4; |
|
const float32x4_t vk22x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk22x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi22x4567, vk22x4567); |
|
|
|
const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; |
|
const float32x4_t vi23x4567 = vld1q_f32(i23); i23 += 4; |
|
const float32x4_t vk23x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk23x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi23x4567, vk23x4567); |
|
|
|
const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; |
|
const float32x4_t vi24x4567 = vld1q_f32(i24); i24 += 4; |
|
const float32x4_t vk24x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk24x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi24x4567, vk24x4567); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
vacc4567 = vminq_f32(vacc4567, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
vst1q_f32(output, vacc4567); output += 4; |
|
} |
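// Remainder: 4 channels. Only the bias load advances w (w += 4), so the
// weights of each tap for this half-tile sit at the fixed offset w + 4 + 8*tap.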
|
for (; c >= 4; c -= 4) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 4); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 12); |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 20); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 28); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vk4x0123 = vld1q_f32(w + 36); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vk5x0123 = vld1q_f32(w + 44); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vk6x0123 = vld1q_f32(w + 52); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vk7x0123 = vld1q_f32(w + 60); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vk8x0123 = vld1q_f32(w + 68); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
|
|
const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; |
|
const float32x4_t vk9x0123 = vld1q_f32(w + 76); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); |
|
|
|
const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; |
|
const float32x4_t vk10x0123 = vld1q_f32(w + 84); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); |
|
|
|
const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; |
|
const float32x4_t vk11x0123 = vld1q_f32(w + 92); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); |
|
|
|
const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; |
|
const float32x4_t vk12x0123 = vld1q_f32(w + 100); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); |
|
|
|
const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; |
|
const float32x4_t vk13x0123 = vld1q_f32(w + 108); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); |
|
|
|
const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; |
|
const float32x4_t vk14x0123 = vld1q_f32(w + 116); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); |
|
|
|
const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; |
|
const float32x4_t vk15x0123 = vld1q_f32(w + 124); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); |
|
|
|
const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; |
|
const float32x4_t vk16x0123 = vld1q_f32(w + 132); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); |
|
|
|
const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; |
|
const float32x4_t vk17x0123 = vld1q_f32(w + 140); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); |
|
|
|
const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; |
|
const float32x4_t vk18x0123 = vld1q_f32(w + 148); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); |
|
|
|
const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; |
|
const float32x4_t vk19x0123 = vld1q_f32(w + 156); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); |
|
|
|
const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; |
|
const float32x4_t vk20x0123 = vld1q_f32(w + 164); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); |
|
|
|
const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; |
|
const float32x4_t vk21x0123 = vld1q_f32(w + 172); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); |
|
|
|
const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; |
|
const float32x4_t vk22x0123 = vld1q_f32(w + 180); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); |
|
|
|
const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; |
|
const float32x4_t vk23x0123 = vld1q_f32(w + 188); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); |
|
|
|
const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; |
|
const float32x4_t vk24x0123 = vld1q_f32(w + 196); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
} |
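// Tail: 1-3 channels. Full vectors are loaded (XNN_OOB_READS permits reading
// past the end of the buffers) and only the live lanes are stored.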
|
if XNN_UNLIKELY(c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 8); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 16); |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 24); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 32); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); |
|
const float32x4_t vk4x0123 = vld1q_f32(w + 40); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); |
|
const float32x4_t vk5x0123 = vld1q_f32(w + 48); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); |
|
const float32x4_t vk6x0123 = vld1q_f32(w + 56); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); |
|
const float32x4_t vk7x0123 = vld1q_f32(w + 64); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); |
|
const float32x4_t vk8x0123 = vld1q_f32(w + 72); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
|
|
const float32x4_t vi9x0123 = vld1q_f32(i9); |
|
const float32x4_t vk9x0123 = vld1q_f32(w + 80); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); |
|
|
|
const float32x4_t vi10x0123 = vld1q_f32(i10); |
|
const float32x4_t vk10x0123 = vld1q_f32(w + 88); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); |
|
|
|
const float32x4_t vi11x0123 = vld1q_f32(i11); |
|
const float32x4_t vk11x0123 = vld1q_f32(w + 96); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); |
|
|
|
const float32x4_t vi12x0123 = vld1q_f32(i12); |
|
const float32x4_t vk12x0123 = vld1q_f32(w + 104); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); |
|
|
|
const float32x4_t vi13x0123 = vld1q_f32(i13); |
|
const float32x4_t vk13x0123 = vld1q_f32(w + 112); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); |
|
|
|
const float32x4_t vi14x0123 = vld1q_f32(i14); |
|
const float32x4_t vk14x0123 = vld1q_f32(w + 120); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); |
|
|
|
const float32x4_t vi15x0123 = vld1q_f32(i15); |
|
const float32x4_t vk15x0123 = vld1q_f32(w + 128); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); |
|
|
|
const float32x4_t vi16x0123 = vld1q_f32(i16); |
|
const float32x4_t vk16x0123 = vld1q_f32(w + 136); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); |
|
|
|
const float32x4_t vi17x0123 = vld1q_f32(i17); |
|
const float32x4_t vk17x0123 = vld1q_f32(w + 144); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); |
|
|
|
const float32x4_t vi18x0123 = vld1q_f32(i18); |
|
const float32x4_t vk18x0123 = vld1q_f32(w + 152); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); |
|
|
|
const float32x4_t vi19x0123 = vld1q_f32(i19); |
|
const float32x4_t vk19x0123 = vld1q_f32(w + 160); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); |
|
|
|
const float32x4_t vi20x0123 = vld1q_f32(i20); |
|
const float32x4_t vk20x0123 = vld1q_f32(w + 168); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); |
|
|
|
const float32x4_t vi21x0123 = vld1q_f32(i21); |
|
const float32x4_t vk21x0123 = vld1q_f32(w + 176); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); |
|
|
|
const float32x4_t vi22x0123 = vld1q_f32(i22); |
|
const float32x4_t vk22x0123 = vld1q_f32(w + 184); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); |
|
|
|
const float32x4_t vi23x0123 = vld1q_f32(i23); |
|
const float32x4_t vk23x0123 = vld1q_f32(w + 192); |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); |
|
|
|
const float32x4_t vi24x0123 = vld1q_f32(i24); |
|
const float32x4_t vk24x0123 = vld1q_f32(w + 200); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
float32x2_t vacc01 = vget_low_f32(vacc0123); |
|
if (c & 2) { |
|
vst1_f32(output, vacc01); output += 2; |
|
vacc01 = vget_high_f32(vacc0123); |
|
} |
|
if (c & 1) { |
|
vst1_lane_f32(output, vacc01, 0); output += 1; |
|
} |
|
} |
|
|
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_width != 0); |
|
} |
|
|
|
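// Depthwise convolution, 3-tap unipass microkernel: NEON FMA, 8-channel
// tiles, single accumulator chain.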
void xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma( |
|
size_t channels, |
|
size_t output_width, |
|
const float** input, |
|
const float* weights, |
|
float* output, |
|
intptr_t input_stride, |
|
size_t output_increment, |
|
size_t input_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(channels != 0); |
|
assert(output_width != 0); |
|
|
|
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
do { |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
|
|
input = (const float**) ((uintptr_t) input + input_stride); |
|
|
|
size_t c = channels; |
|
const float* w = weights; |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
vacc4567 = vminq_f32(vacc4567, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
vst1q_f32(output, vacc4567); output += 4; |
|
} |
|
for (; c >= 4; c -= 4) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 4); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 12); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 20); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
} |
|
if XNN_UNLIKELY(c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 8); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 16); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 24); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
float32x2_t vacc01 = vget_low_f32(vacc0123); |
|
if (c & 2) { |
|
vst1_f32(output, vacc01); output += 2; |
|
vacc01 = vget_high_f32(vacc0123); |
|
} |
|
if (c & 1) { |
|
vst1_lane_f32(output, vacc01, 0); output += 1; |
|
} |
|
} |
|
|
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_width != 0); |
|
} |
|
|
|
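// Depthwise convolution, 4-tap unipass microkernel: NEON FMA, 8-channel tiles.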
void xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma( |
|
size_t channels, |
|
size_t output_width, |
|
const float** input, |
|
const float* weights, |
|
float* output, |
|
intptr_t input_stride, |
|
size_t output_increment, |
|
size_t input_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(channels != 0); |
|
assert(output_width != 0); |
|
|
|
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
do { |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
|
|
input = (const float**) ((uintptr_t) input + input_stride); |
|
|
|
size_t c = channels; |
|
const float* w = weights; |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi3x4567, vk3x4567); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
vacc4567 = vminq_f32(vacc4567, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
vst1q_f32(output, vacc4567); output += 4; |
|
} |
|
for (; c >= 4; c -= 4) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 4); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 12); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 20); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 28); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
} |
|
if XNN_UNLIKELY(c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 8); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 16); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 24); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 32); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
float32x2_t vacc01 = vget_low_f32(vacc0123); |
|
if (c & 2) { |
|
vst1_f32(output, vacc01); output += 2; |
|
vacc01 = vget_high_f32(vacc0123); |
|
} |
|
if (c & 1) { |
|
vst1_lane_f32(output, vacc01, 0); output += 1; |
|
} |
|
} |
|
|
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_width != 0); |
|
} |
|
|
|
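// Depthwise convolution, multipass microkernel: 5 taps in the first pass,
// 5 per middle pass, 5 in the last pass; 8-channel tiles with 4-channel
// subtiles and remainder. Kernels with more than 5 taps stage partial sums in
// `buffer` between passes.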
void xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2( |
|
size_t channels, |
|
size_t output_width, |
|
const float** input, |
|
const float* weights, |
|
float* output, |
|
intptr_t input_stride, |
|
size_t output_increment, |
|
size_t input_offset, |
|
const float* zero, |
|
size_t kernel_size, |
|
float* buffer, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(channels != 0); |
|
assert(output_width != 0); |
|
assert(kernel_size > 5); |
|
|
|
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
do { |
|
const float* w = weights; |
|
|
|
|
|
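// First pass: taps 0-4. Bias plus partial sums are written to `buffer`.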
{ |
|
float* b = buffer; |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
const float* i4 = input[4]; |
|
assert(i4 != NULL); |
|
if XNN_UNPREDICTABLE(i4 != zero) { |
|
i4 = (const float*) ((uintptr_t) i4 + input_offset); |
|
} |
|
input += 5; |
|
|
|
|
|
size_t c = round_up_po2(channels, 4); |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
|
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
|
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
|
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; |
|
|
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); |
|
|
|
vst1q_f32(b, vacc0123p0); b += 4; |
|
vst1q_f32(b, vacc4567p0); b += 4; |
|
} |
|
|
|
if (c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
|
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
|
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
|
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
|
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
vst1q_f32(b, vacc0123p0); b += 4; |
|
} |
|
} |
|
|
|
|
|
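// Middle passes: 5 taps each while more than 5 taps remain; partial sums are
// read from and written back to `buffer`.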
for (size_t ks = kernel_size - 5; ks > 5; ks -= 5) { |
|
float* b = buffer; |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
const float* i4 = input[4]; |
|
assert(i4 != NULL); |
|
if XNN_UNPREDICTABLE(i4 != zero) { |
|
i4 = (const float*) ((uintptr_t) i4 + input_offset); |
|
} |
|
input += 5; |
|
|
|
size_t c = round_up_po2(channels, 4); |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(b); |
|
float32x4_t vacc4567p0 = vld1q_f32(b + 4); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
|
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
|
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
|
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; |
|
|
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); |
|
|
|
vst1q_f32(b, vacc0123p0); b += 4; |
|
vst1q_f32(b, vacc4567p0); b += 4; |
|
} |
|
|
|
if (c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(b); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
|
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
|
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
|
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
|
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
vst1q_f32(b, vacc0123p0); b += 4; |
|
} |
|
} |
|
|
|
|
|
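// Last pass: the final 5 taps. Partial sums are read from `buffer`, clamped,
// and stored to `output`.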
{ |
|
float* b = buffer; |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
const float* i4 = input[4]; |
|
assert(i4 != NULL); |
|
if XNN_UNPREDICTABLE(i4 != zero) { |
|
i4 = (const float*) ((uintptr_t) i4 + input_offset); |
|
} |
|
|
|
size_t c = channels; |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(b); b += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
|
|
float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
|
|
float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
|
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
|
|
float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
|
|
float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; |
|
|
|
float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vk4x4567 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); |
|
|
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
vacc4567 = vminq_f32(vacc4567, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
vst1q_f32(output, vacc4567); output += 4; |
|
} |
|
|
|
|
|
for (; c >= 4; c -= 4) { |
|
float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
|
|
float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
|
|
float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
|
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
|
|
float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
|
|
float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
|
|
float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
|
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
|
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
} |
|
|
|
if XNN_UNLIKELY(c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(b); |
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); |
|
float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); |
|
float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); |
|
float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); |
|
float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); |
|
float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
|
|
vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); |
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
float32x2_t vacc01 = vget_low_f32(vacc0123); |
|
if (c & 2) { |
|
vst1_f32(output, vacc01); output += 2; |
|
vacc01 = vget_high_f32(vacc0123); |
|
} |
|
if (c & 1) { |
|
vst1_lane_f32(output, vacc01, 0); output += 1; |
|
} |
|
} |
|
|
|
} |
|
input = (const float**) ((uintptr_t) input + input_stride); |
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_width != 0); |
|
} |
|
|
|
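// Depthwise convolution, 9-tap unipass microkernel: NEON FMA, 8-channel tiles.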
void xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma( |
|
size_t channels, |
|
size_t output_width, |
|
const float** input, |
|
const float* weights, |
|
float* output, |
|
intptr_t input_stride, |
|
size_t output_increment, |
|
size_t input_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(channels != 0); |
|
assert(output_width != 0); |
|
|
|
const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
do { |
|
const float* i0 = input[0]; |
|
assert(i0 != NULL); |
|
if XNN_UNPREDICTABLE(i0 != zero) { |
|
i0 = (const float*) ((uintptr_t) i0 + input_offset); |
|
} |
|
const float* i1 = input[1]; |
|
assert(i1 != NULL); |
|
if XNN_UNPREDICTABLE(i1 != zero) { |
|
i1 = (const float*) ((uintptr_t) i1 + input_offset); |
|
} |
|
const float* i2 = input[2]; |
|
assert(i2 != NULL); |
|
if XNN_UNPREDICTABLE(i2 != zero) { |
|
i2 = (const float*) ((uintptr_t) i2 + input_offset); |
|
} |
|
const float* i3 = input[3]; |
|
assert(i3 != NULL); |
|
if XNN_UNPREDICTABLE(i3 != zero) { |
|
i3 = (const float*) ((uintptr_t) i3 + input_offset); |
|
} |
|
const float* i4 = input[4]; |
|
assert(i4 != NULL); |
|
if XNN_UNPREDICTABLE(i4 != zero) { |
|
i4 = (const float*) ((uintptr_t) i4 + input_offset); |
|
} |
|
const float* i5 = input[5]; |
|
assert(i5 != NULL); |
|
if XNN_UNPREDICTABLE(i5 != zero) { |
|
i5 = (const float*) ((uintptr_t) i5 + input_offset); |
|
} |
|
const float* i6 = input[6]; |
|
assert(i6 != NULL); |
|
if XNN_UNPREDICTABLE(i6 != zero) { |
|
i6 = (const float*) ((uintptr_t) i6 + input_offset); |
|
} |
|
const float* i7 = input[7]; |
|
assert(i7 != NULL); |
|
if XNN_UNPREDICTABLE(i7 != zero) { |
|
i7 = (const float*) ((uintptr_t) i7 + input_offset); |
|
} |
|
const float* i8 = input[8]; |
|
assert(i8 != NULL); |
|
if XNN_UNPREDICTABLE(i8 != zero) { |
|
i8 = (const float*) ((uintptr_t) i8 + input_offset); |
|
} |
|
|
|
input = (const float**) ((uintptr_t) input + input_stride); |
|
|
|
size_t c = channels; |
|
const float* w = weights; |
|
for (; c >= 8; c -= 8) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi3x4567, vk3x4567); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi5x4567, vk5x4567); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi7x4567, vk7x4567); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; |
|
const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
vacc4567 = vminq_f32(vacc4567, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
vst1q_f32(output, vacc4567); output += 4; |
|
} |
|
for (; c >= 4; c -= 4) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 4); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 12); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 20); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 28); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; |
|
const float32x4_t vk4x0123 = vld1q_f32(w + 36); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; |
|
const float32x4_t vk5x0123 = vld1q_f32(w + 44); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; |
|
const float32x4_t vk6x0123 = vld1q_f32(w + 52); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; |
|
const float32x4_t vk7x0123 = vld1q_f32(w + 60); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; |
|
const float32x4_t vk8x0123 = vld1q_f32(w + 68); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
vst1q_f32(output, vacc0123); output += 4; |
|
} |
|
if XNN_UNLIKELY(c != 0) { |
|
float32x4_t vacc0123p0 = vld1q_f32(w); |
|
|
|
|
|
const float32x4_t vi0x0123 = vld1q_f32(i0); |
|
const float32x4_t vk0x0123 = vld1q_f32(w + 8); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); |
|
|
|
const float32x4_t vi1x0123 = vld1q_f32(i1); |
|
const float32x4_t vk1x0123 = vld1q_f32(w + 16); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); |
|
|
|
const float32x4_t vi2x0123 = vld1q_f32(i2); |
|
const float32x4_t vk2x0123 = vld1q_f32(w + 24); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); |
|
|
|
const float32x4_t vi3x0123 = vld1q_f32(i3); |
|
const float32x4_t vk3x0123 = vld1q_f32(w + 32); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); |
|
|
|
const float32x4_t vi4x0123 = vld1q_f32(i4); |
|
const float32x4_t vk4x0123 = vld1q_f32(w + 40); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); |
|
|
|
const float32x4_t vi5x0123 = vld1q_f32(i5); |
|
const float32x4_t vk5x0123 = vld1q_f32(w + 48); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); |
|
|
|
const float32x4_t vi6x0123 = vld1q_f32(i6); |
|
const float32x4_t vk6x0123 = vld1q_f32(w + 56); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); |
|
|
|
const float32x4_t vi7x0123 = vld1q_f32(i7); |
|
const float32x4_t vk7x0123 = vld1q_f32(w + 64); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); |
|
|
|
const float32x4_t vi8x0123 = vld1q_f32(i8); |
|
const float32x4_t vk8x0123 = vld1q_f32(w + 72); |
|
vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); |
|
|
|
|
|
float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); |
|
vacc0123 = vminq_f32(vacc0123, vmax); |
|
|
|
float32x2_t vacc01 = vget_low_f32(vacc0123); |
|
if (c & 2) { |
|
vst1_f32(output, vacc01); output += 2; |
|
vacc01 = vget_high_f32(vacc0123); |
|
} |
|
if (c & 1) { |
|
vst1_lane_f32(output, vacc01, 0); output += 1; |
|
} |
|
} |
|
|
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_width != 0); |
|
} |
|
|
|
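// GEMM microkernel with an "s4" (shift-by-4) layout of the packed weights:
// each group of 4 k values is stored as four pre-rotated 8-wide rows, so one
// activation register can be reused across the group, rotated in place with
// vextq_f32 between FMAs instead of being reloaded or lane-broadcast.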
void xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
const float* restrict a, |
|
size_t a_stride, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 1); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
const float* a0 = a; |
|
float* c0 = c; |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
|
|
|
|
k -= 4 * sizeof(float); |
|
} |
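
// k remainder (1..3 floats). The activation load reads up to 3 floats past
// the valid data, and the packed weights are expected to be zero-padded here;
// masking out the activation lanes whose weight is exactly 0.0f (vbic with
// the vceqq mask) keeps out-of-bounds garbage (possibly Inf/NaN) from
// reaching the accumulators, since Inf * 0.0f would produce NaN.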
|
if XNN_UNLIKELY(k != 0) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
|
|
} |
|
const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); |
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
|
|
const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); |
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a0 = (const float*) ((uintptr_t) a0 - kc); |
|
|
|
nc -= 8; |
|
|
|
} else { |
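// nc remainder (1..7 columns): store progressively narrower pieces (4, 2,
// then 1 lane), shifting the surviving lanes down after each partial store.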
|
if (nc & 4) { |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
|
|
|
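// 4-row variant of the same s4 GEMM. When mr < 4, the unused row pointers
// alias the previous row (a1 = a0, c1 = c0, ...), so the extra rows redo a
// valid row's work instead of reading or writing out of bounds.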
void xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
const float* restrict a, |
|
size_t a_stride, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 4); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
const float* a0 = a; |
|
float* c0 = c; |
|
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); |
|
float* c1 = (float*) ((uintptr_t) c0 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 2) { |
|
a1 = a0; |
|
c1 = c0; |
|
} |
|
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); |
|
float* c2 = (float*) ((uintptr_t) c1 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 2) { |
|
a2 = a1; |
|
c2 = c1; |
|
} |
|
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); |
|
float* c3 = (float*) ((uintptr_t) c2 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr != 4) { |
|
a3 = a2; |
|
c3 = c2; |
|
} |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc1x0123 = vacc0x0123; |
|
float32x4_t vacc1x4567 = vacc0x4567; |
|
float32x4_t vacc2x0123 = vacc0x0123; |
|
float32x4_t vacc2x4567 = vacc0x4567; |
|
float32x4_t vacc3x0123 = vacc0x0123; |
|
float32x4_t vacc3x4567 = vacc0x4567; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
float32x4_t va1 = vld1q_f32(a1); a1 += 4; |
|
float32x4_t va2 = vld1q_f32(a2); a2 += 4; |
|
float32x4_t va3 = vld1q_f32(a3); a3 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); |
|
|
|
|
|
k -= 4 * sizeof(float); |
|
} |
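
// k remainder: the same zero-masked FMA trick as in the 1x8s4 kernel, applied
// to all four rows.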
|
if XNN_UNLIKELY(k != 0) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); |
|
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); |
|
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); |
|
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); |
|
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); |
|
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); |
|
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); |
|
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); |
|
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); |
|
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); |
|
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); |
|
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); |
|
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); |
|
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); |
|
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); |
|
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); |
|
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); |
|
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); |
|
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); |
|
|
|
} |
|
const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); |
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
vacc2x0123 = vminq_f32(vacc2x0123, vmax); |
|
vacc3x0123 = vminq_f32(vacc3x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
vacc1x4567 = vminq_f32(vacc1x4567, vmax); |
|
vacc2x4567 = vminq_f32(vacc2x4567, vmax); |
|
vacc3x4567 = vminq_f32(vacc3x4567, vmax); |
|
|
|
const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); |
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); |
|
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); |
|
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); |
|
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c3, vacc3x0123); |
|
vst1q_f32(c3 + 4, vacc3x4567); |
|
c3 = (float*) ((uintptr_t) c3 + cn_stride); |
|
vst1q_f32(c2, vacc2x0123); |
|
vst1q_f32(c2 + 4, vacc2x4567); |
|
c2 = (float*) ((uintptr_t) c2 + cn_stride); |
|
vst1q_f32(c1, vacc1x0123); |
|
vst1q_f32(c1 + 4, vacc1x4567); |
|
c1 = (float*) ((uintptr_t) c1 + cn_stride); |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a3 = (const float*) ((uintptr_t) a3 - kc); |
|
a2 = (const float*) ((uintptr_t) a2 - kc); |
|
a1 = (const float*) ((uintptr_t) a1 - kc); |
|
a0 = (const float*) ((uintptr_t) a0 - kc); |
|
|
|
nc -= 8; |
|
|
|
} else { |
|
if (nc & 4) { |
|
vst1q_f32(c3, vacc3x0123); c3 += 4; |
|
vst1q_f32(c2, vacc2x0123); c2 += 4; |
|
vst1q_f32(c1, vacc1x0123); c1 += 4; |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc3x0123 = vacc3x4567; |
|
vacc2x0123 = vacc2x4567; |
|
vacc1x0123 = vacc1x4567; |
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); |
|
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); |
|
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c3, vacc3x01); c3 += 2; |
|
vst1_f32(c2, vacc2x01); c2 += 2; |
|
vst1_f32(c1, vacc1x01); c1 += 2; |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc3x01 = vget_high_f32(vacc3x0123); |
|
vacc2x01 = vget_high_f32(vacc2x0123); |
|
vacc1x01 = vget_high_f32(vacc1x0123); |
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c3, vacc3x01, 0); |
|
vst1_lane_f32(c2, vacc2x01, 0); |
|
vst1_lane_f32(c1, vacc1x01, 0); |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
|
|
|
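// 6-row variant: identical structure, with the mr clamping extended to cover
// a4/c4 and a5/c5.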
void xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
const float* restrict a, |
|
size_t a_stride, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 6); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
const float* a0 = a; |
|
float* c0 = c; |
|
const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); |
|
float* c1 = (float*) ((uintptr_t) c0 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 2) { |
|
a1 = a0; |
|
c1 = c0; |
|
} |
|
const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); |
|
float* c2 = (float*) ((uintptr_t) c1 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 2) { |
|
a2 = a1; |
|
c2 = c1; |
|
} |
|
const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); |
|
float* c3 = (float*) ((uintptr_t) c2 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 4) { |
|
a3 = a2; |
|
c3 = c2; |
|
} |
|
const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); |
|
float* c4 = (float*) ((uintptr_t) c3 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 4) { |
|
a4 = a3; |
|
c4 = c3; |
|
} |
|
const float* a5 = (const float*) ((uintptr_t) a4 + a_stride); |
|
float* c5 = (float*) ((uintptr_t) c4 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr != 6) { |
|
a5 = a4; |
|
c5 = c4; |
|
} |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc1x0123 = vacc0x0123; |
|
float32x4_t vacc1x4567 = vacc0x4567; |
|
float32x4_t vacc2x0123 = vacc0x0123; |
|
float32x4_t vacc2x4567 = vacc0x4567; |
|
float32x4_t vacc3x0123 = vacc0x0123; |
|
float32x4_t vacc3x4567 = vacc0x4567; |
|
float32x4_t vacc4x0123 = vacc0x0123; |
|
float32x4_t vacc4x4567 = vacc0x4567; |
|
float32x4_t vacc5x0123 = vacc0x0123; |
|
float32x4_t vacc5x4567 = vacc0x4567; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
float32x4_t va1 = vld1q_f32(a1); a1 += 4; |
|
float32x4_t va2 = vld1q_f32(a2); a2 += 4; |
|
float32x4_t va3 = vld1q_f32(a3); a3 += 4; |
|
float32x4_t va4 = vld1q_f32(a4); a4 += 4; |
|
float32x4_t va5 = vld1q_f32(a5); a5 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3); |
|
|
|
|
|
k -= 4 * sizeof(float); |
|
} |
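
// k remainder with zero-weight masking, across all six rows.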
|
if XNN_UNLIKELY(k != 0) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); |
|
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); |
|
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); |
|
float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k); |
|
float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); |
|
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); |
|
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); |
|
const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0); |
|
const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); |
|
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); |
|
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); |
|
const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0); |
|
const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); |
|
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); |
|
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); |
|
const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1); |
|
const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); |
|
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); |
|
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); |
|
const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1); |
|
const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); |
|
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); |
|
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); |
|
const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2); |
|
const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); |
|
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); |
|
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); |
|
const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2); |
|
const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); |
|
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); |
|
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); |
|
const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3); |
|
const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); |
|
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); |
|
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); |
|
const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3); |
|
const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3); |
|
|
|
} |
|
const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); |
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
vacc2x0123 = vminq_f32(vacc2x0123, vmax); |
|
vacc3x0123 = vminq_f32(vacc3x0123, vmax); |
|
vacc4x0123 = vminq_f32(vacc4x0123, vmax); |
|
vacc5x0123 = vminq_f32(vacc5x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
vacc1x4567 = vminq_f32(vacc1x4567, vmax); |
|
vacc2x4567 = vminq_f32(vacc2x4567, vmax); |
|
vacc3x4567 = vminq_f32(vacc3x4567, vmax); |
|
vacc4x4567 = vminq_f32(vacc4x4567, vmax); |
|
vacc5x4567 = vminq_f32(vacc5x4567, vmax); |
|
|
|
const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); |
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); |
|
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); |
|
vacc4x0123 = vmaxq_f32(vacc4x0123, vmin); |
|
vacc5x0123 = vmaxq_f32(vacc5x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); |
|
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); |
|
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); |
|
vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); |
|
vacc5x4567 = vmaxq_f32(vacc5x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c5, vacc5x0123); |
|
vst1q_f32(c5 + 4, vacc5x4567); |
|
c5 = (float*) ((uintptr_t) c5 + cn_stride); |
|
vst1q_f32(c4, vacc4x0123); |
|
vst1q_f32(c4 + 4, vacc4x4567); |
|
c4 = (float*) ((uintptr_t) c4 + cn_stride); |
|
vst1q_f32(c3, vacc3x0123); |
|
vst1q_f32(c3 + 4, vacc3x4567); |
|
c3 = (float*) ((uintptr_t) c3 + cn_stride); |
|
vst1q_f32(c2, vacc2x0123); |
|
vst1q_f32(c2 + 4, vacc2x4567); |
|
c2 = (float*) ((uintptr_t) c2 + cn_stride); |
|
vst1q_f32(c1, vacc1x0123); |
|
vst1q_f32(c1 + 4, vacc1x4567); |
|
c1 = (float*) ((uintptr_t) c1 + cn_stride); |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a5 = (const float*) ((uintptr_t) a5 - kc); |
|
a4 = (const float*) ((uintptr_t) a4 - kc); |
|
a3 = (const float*) ((uintptr_t) a3 - kc); |
|
a2 = (const float*) ((uintptr_t) a2 - kc); |
|
a1 = (const float*) ((uintptr_t) a1 - kc); |
|
a0 = (const float*) ((uintptr_t) a0 - kc); |
|
|
|
nc -= 8; |
|
|
|
} else { |
|
if (nc & 4) { |
|
vst1q_f32(c5, vacc5x0123); c5 += 4; |
|
vst1q_f32(c4, vacc4x0123); c4 += 4; |
|
vst1q_f32(c3, vacc3x0123); c3 += 4; |
|
vst1q_f32(c2, vacc2x0123); c2 += 4; |
|
vst1q_f32(c1, vacc1x0123); c1 += 4; |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc5x0123 = vacc5x4567; |
|
vacc4x0123 = vacc4x4567; |
|
vacc3x0123 = vacc3x4567; |
|
vacc2x0123 = vacc2x4567; |
|
vacc1x0123 = vacc1x4567; |
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc5x01 = vget_low_f32(vacc5x0123); |
|
float32x2_t vacc4x01 = vget_low_f32(vacc4x0123); |
|
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); |
|
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); |
|
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c5, vacc5x01); c5 += 2; |
|
vst1_f32(c4, vacc4x01); c4 += 2; |
|
vst1_f32(c3, vacc3x01); c3 += 2; |
|
vst1_f32(c2, vacc2x01); c2 += 2; |
|
vst1_f32(c1, vacc1x01); c1 += 2; |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc5x01 = vget_high_f32(vacc5x0123); |
|
vacc4x01 = vget_high_f32(vacc4x0123); |
|
vacc3x01 = vget_high_f32(vacc3x0123); |
|
vacc2x01 = vget_high_f32(vacc2x0123); |
|
vacc1x01 = vget_high_f32(vacc1x0123); |
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c5, vacc5x01, 0); |
|
vst1_lane_f32(c4, vacc4x01, 0); |
|
vst1_lane_f32(c3, vacc3x01, 0); |
|
vst1_lane_f32(c2, vacc2x01, 0); |
|
vst1_lane_f32(c1, vacc1x01, 0); |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
|
|
|
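// Bilinear interpolation over CHW-layout data. For each output pixel the
// indirection buffer supplies two pointers, one to the top-left and one to
// the bottom-left corner, each covering a horizontally adjacent pixel pair;
// the per-pixel weights are stored as interleaved (alpha_h, alpha_v) pairs.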
void xnn_f32_ibilinear_chw_ukernel__neonfma_p8( |
|
size_t output_pixels, |
|
size_t channels, |
|
const float** restrict input, |
|
size_t input_offset, |
|
const float* restrict weights, |
|
float* restrict output, |
|
size_t input_increment) XNN_OOB_READS |
|
{ |
|
assert(output_pixels != 0); |
|
assert(channels != 0); |
|
assert(input_increment % sizeof(float) == 0); |
|
|
|
do { |
|
const float** i = input; |
|
const float* w = weights; |
|
size_t p = output_pixels; |
|
for (; p >= 8; p -= 8) { |
|
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); |
|
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); |
|
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); |
|
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); |
|
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); |
|
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); |
|
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); |
|
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); |
|
const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset); |
|
const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset); |
|
const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset); |
|
const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset); |
|
const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset); |
|
const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset); |
|
const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset); |
|
const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset); |
|
i += 2 * 8; |
|
|
|
const float32x4x2_t vw0123 = vld2q_f32(w + 0); |
|
const float32x4x2_t vw4567 = vld2q_f32(w + 8); |
|
w += 2 * 8; |
|
|
|
const float32x2_t vtltr0 = vld1_f32(itl0); |
|
const float32x2_t vblbr0 = vld1_f32(ibl0); |
|
const float32x2_t vtltr1 = vld1_f32(itl1); |
|
const float32x2_t vblbr1 = vld1_f32(ibl1); |
|
const float32x2_t vtltr2 = vld1_f32(itl2); |
|
const float32x2_t vblbr2 = vld1_f32(ibl2); |
|
const float32x2_t vtltr3 = vld1_f32(itl3); |
|
const float32x2_t vblbr3 = vld1_f32(ibl3); |
|
const float32x2_t vtltr4 = vld1_f32(itl4); |
|
const float32x2_t vblbr4 = vld1_f32(ibl4); |
|
const float32x2_t vtltr5 = vld1_f32(itl5); |
|
const float32x2_t vblbr5 = vld1_f32(ibl5); |
|
const float32x2_t vtltr6 = vld1_f32(itl6); |
|
const float32x2_t vblbr6 = vld1_f32(ibl6); |
|
const float32x2_t vtltr7 = vld1_f32(itl7); |
|
const float32x2_t vblbr7 = vld1_f32(ibl7); |
|
|
|
const float32x4_t valphah0123 = vw0123.val[0]; |
|
const float32x4_t valphav0123 = vw0123.val[1]; |
|
const float32x4_t valphah4567 = vw4567.val[0]; |
|
const float32x4_t valphav4567 = vw4567.val[1]; |
|
|
|
const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); |
|
const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); |
|
const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); |
|
const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); |
|
const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5); |
|
const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5); |
|
const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7); |
|
const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7); |
|
|
|
const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); |
|
const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); |
|
const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45); |
|
const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67); |
|
|
|
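// vuzpq_f32 de-interleaves the packed (left, right) pairs: val[0] gathers the
// left-edge values for four pixels, val[1] the right-edge ones.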
const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23); |
|
const float32x4_t vld0123 = vld_t0123.val[0]; |
|
const float32x4_t vrd0123 = vld_t0123.val[1]; |
|
const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67); |
|
const float32x4_t vld4567 = vld_t4567.val[0]; |
|
const float32x4_t vrd4567 = vld_t4567.val[1]; |
|
|
|
const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23); |
|
const float32x4_t vtl0123 = vtl_t0123.val[0]; |
|
const float32x4_t vtr0123 = vtl_t0123.val[1]; |
|
const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67); |
|
const float32x4_t vtl4567 = vtl_t4567.val[0]; |
|
const float32x4_t vtr4567 = vtl_t4567.val[1]; |
|
|
|
const float32x4_t vl0123 = vfmaq_f32(vtl0123, vld0123, valphav0123); |
|
const float32x4_t vr0123 = vfmaq_f32(vtr0123, vrd0123, valphav0123); |
|
const float32x4_t vl4567 = vfmaq_f32(vtl4567, vld4567, valphav4567); |
|
const float32x4_t vr4567 = vfmaq_f32(vtr4567, vrd4567, valphav4567); |
|
|
|
const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123); |
|
const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567); |
|
|
|
const float32x4_t vo0123 = vfmaq_f32(vl0123, vd0123, valphah0123); |
|
const float32x4_t vo4567 = vfmaq_f32(vl4567, vd4567, valphah4567); |
|
|
|
vst1q_f32(output + 0, vo0123); |
|
vst1q_f32(output + 4, vo4567); |
|
output += 8; |
|
} |
|
|
|
for (; p >= 4; p -= 4) { |
|
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); |
|
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); |
|
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); |
|
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); |
|
const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); |
|
const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); |
|
const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); |
|
const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); |
|
i += 8; |
|
|
|
const float32x4x2_t vw = vld2q_f32(w); |
|
w += 8; |
|
|
|
const float32x2_t vtltr0 = vld1_f32(itl0); |
|
const float32x2_t vblbr0 = vld1_f32(ibl0); |
|
const float32x2_t vtltr1 = vld1_f32(itl1); |
|
const float32x2_t vblbr1 = vld1_f32(ibl1); |
|
const float32x2_t vtltr2 = vld1_f32(itl2); |
|
const float32x2_t vblbr2 = vld1_f32(ibl2); |
|
const float32x2_t vtltr3 = vld1_f32(itl3); |
|
const float32x2_t vblbr3 = vld1_f32(ibl3); |
|
|
|
const float32x4_t valphah = vw.val[0]; |
|
const float32x4_t valphav = vw.val[1]; |
|
|
|
const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); |
|
const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); |
|
const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); |
|
const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); |
|
|
|
const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); |
|
const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); |
|
|
|
const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23); |
|
const float32x4_t vld = vld_t.val[0]; |
|
const float32x4_t vrd = vld_t.val[1]; |
|
|
|
const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23); |
|
const float32x4_t vtl = vtl_t.val[0]; |
|
const float32x4_t vtr = vtl_t.val[1]; |
|
|
|
const float32x4_t vl = vfmaq_f32(vtl, vld, valphav); |
|
const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav); |
|
|
|
const float32x4_t vd = vsubq_f32(vr, vl); |
|
const float32x4_t vo = vfmaq_f32(vl, vd, valphah); |
|
|
|
vst1q_f32(output, vo); |
|
output += 4; |
|
} |
|
|
|
if XNN_UNLIKELY(p != 0) { |
|
if (p & 2) { |
|
const float32x2x2_t vw = vld2_f32(w); |
|
w += 4; |
|
|
|
const float32x2_t valphah = vw.val[0]; |
|
const float32x2_t valphav = vw.val[1]; |
|
|
|
const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); |
|
const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); |
|
const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); |
|
const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); |
|
i += 4; |
|
|
|
const float32x2_t vtltr0 = vld1_f32(itl0); |
|
const float32x2_t vblbr0 = vld1_f32(ibl0); |
|
const float32x2_t vtltr1 = vld1_f32(itl1); |
|
const float32x2_t vblbr1 = vld1_f32(ibl1); |
|
|
|
const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0); |
|
const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1); |
|
|
|
const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1); |
|
const float32x2_t vld = vld_t.val[0]; |
|
const float32x2_t vrd = vld_t.val[1]; |
|
|
|
const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1); |
|
const float32x2_t vtl = vtl_t.val[0]; |
|
const float32x2_t vtr = vtl_t.val[1]; |
|
|
|
const float32x2_t vl = vfma_f32(vtl, vld, valphav); |
|
const float32x2_t vr = vfma_f32(vtr, vrd, valphav); |
|
|
|
const float32x2_t vd = vsub_f32(vr, vl); |
|
const float32x2_t vo = vfma_f32(vl, vd, valphah); |
|
|
|
vst1_f32(output, vo); |
|
output += 2; |
|
} |
|
|
|
if (p & 1) {
// The scalar epilogue evaluates the bilinear formula
//   result = (1 - alpha_h) * (1 - alpha_v) * top_left
//          +      alpha_h  * (1 - alpha_v) * top_right
//          + (1 - alpha_h) *      alpha_v  * bottom_left
//          +      alpha_h  *      alpha_v  * bottom_right,
// factored as
//   result = left + alpha_h * (right - left),
// where
//   left  = top_left  + alpha_v * (bottom_left  - top_left),
//   right = top_right + alpha_v * (bottom_right - top_right).

const float alphah = *w; |
|
const float32x2_t valphav = vld1_dup_f32(w + 1); |
|
w += 2; |
|
|
|
const float* itl = (const float*) ((uintptr_t) i[0] + input_offset); |
|
const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); |
|
i += 2; |
|
|
|
const float32x2_t vtltr = vld1_f32(itl); |
|
const float32x2_t vblbr = vld1_f32(ibl);

// Compute left and right in one 2-lane vector: lane 0 holds
// left = tl + alpha_v * (bl - tl), lane 1 holds right = tr + alpha_v * (br - tr).
const float32x2_t vldrd = vsub_f32(vblbr, vtltr); |
|
const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav); |
|
|
|
|
|
const float l = vget_lane_f32(vlr, 0); |
|
const float r = vget_lane_f32(vlr, 1); |
|
|
|
*output++ = l + alphah * (r - l); |
|
} |
|
} |
|
|
|
input_offset += input_increment; |
|
} while (--channels != 0); |
|
} |
|
|
|
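// Bilinear interpolation over NHWC-layout data: four pointers per output
// pixel (top-left, top-right, bottom-left, bottom-right), one
// (alpha_h, alpha_v) pair per pixel, and `channels` counted in bytes.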
void xnn_f32_ibilinear_ukernel__neonfma_c8( |
|
size_t output_pixels, |
|
size_t channels, |
|
const float** restrict input, |
|
size_t input_offset, |
|
const float* restrict weights, |
|
float* restrict output, |
|
size_t output_increment) XNN_OOB_READS |
|
{ |
|
assert(output_pixels != 0); |
|
assert(channels != 0); |
|
assert(channels % sizeof(float) == 0); |
|
|
|
do { |
|
const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset); |
|
const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset); |
|
const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset); |
|
const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset); |
|
input += 4; |
|
|
|
const float32x2_t valphahv = vld1_f32(weights); weights += 2; |
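// vfmaq_lane_f32 is only available on AArch64; on 32-bit ARM the two weights
// are broadcast into full vectors up front instead.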
|
#if XNN_ARCH_ARM |
|
const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); |
|
const float32x4_t valphav = vdupq_lane_f32(valphahv, 1); |
|
#endif |
|
|
|
size_t c = channels; |
|
for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { |
|
const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; |
|
const float32x4_t vtl4567 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vtr4567 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vbl4567 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vbr4567 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); |
|
const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); |
|
const float32x4_t vtd4567 = vsubq_f32(vtr4567, vtl4567); |
|
const float32x4_t vbd4567 = vsubq_f32(vbr4567, vbl4567); |
|
|
|
#if XNN_ARCH_ARM |
|
const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); |
|
const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); |
|
const float32x4_t vt4567 = vfmaq_f32(vtl4567, vtd4567, valphah); |
|
const float32x4_t vb4567 = vfmaq_f32(vbl4567, vbd4567, valphah); |
|
#else |
|
const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); |
|
const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); |
|
const float32x4_t vt4567 = vfmaq_lane_f32(vtl4567, vtd4567, valphahv, 0); |
|
const float32x4_t vb4567 = vfmaq_lane_f32(vbl4567, vbd4567, valphahv, 0); |
|
#endif |
|
|
|
const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); |
|
const float32x4_t vd4567 = vsubq_f32(vb4567, vt4567); |
|
|
|
#if XNN_ARCH_ARM |
|
const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); |
|
const float32x4_t vo4567 = vfmaq_f32(vt4567, vd4567, valphav); |
|
#else |
|
const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); |
|
const float32x4_t vo4567 = vfmaq_lane_f32(vt4567, vd4567, valphahv, 1); |
|
#endif |
|
|
|
vst1q_f32(output, vo0123); output += 4; |
|
vst1q_f32(output, vo4567); output += 4; |
|
} |
|
for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { |
|
const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; |
|
const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; |
|
const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; |
|
const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; |
|
|
|
const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); |
|
const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); |
|
|
|
#if XNN_ARCH_ARM |
|
const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); |
|
const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); |
|
#else |
|
const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); |
|
const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); |
|
#endif |
|
|
|
const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); |
|
|
|
#if XNN_ARCH_ARM |
|
const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); |
|
#else |
|
const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); |
|
#endif |
|
|
|
vst1q_f32(output, vo0123); |
|
output += 4; |
|
} |
|
if XNN_UNLIKELY(c != 0) { |
|
const float32x4_t vtl0123 = vld1q_f32(i0); |
|
const float32x4_t vtr0123 = vld1q_f32(i1); |
|
const float32x4_t vbl0123 = vld1q_f32(i2); |
|
const float32x4_t vbr0123 = vld1q_f32(i3); |
|
|
|
const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); |
|
const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); |
|
|
|
#if XNN_ARCH_ARM |
|
const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); |
|
const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); |
|
#else |
|
const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); |
|
const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); |
|
#endif |
|
|
|
const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); |
|
|
|
#if XNN_ARCH_ARM |
|
float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); |
|
#else |
|
float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); |
|
#endif |
|
|
|
float32x2_t vo01 = vget_low_f32(vo0123); |
|
if (c & (2 * sizeof(float))) { |
|
vst1_f32(output, vo01); output += 2; |
|
vo01 = vget_high_f32(vo0123); |
|
} |
|
if (c & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vo01, 0); output += 1; |
|
} |
|
} |
|
|
|
output = (float*) ((uintptr_t) output + output_increment); |
|
} while (--output_pixels != 0); |
|
} |
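
// The "s4" IGEMM microkernels below rotate each 4-float activation vector one
// lane (VEXT) between the four FMA passes of a k-block; the packed weights are
// pre-shuffled to match, so no per-lane broadcasts of A are needed.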
|
|
|
void xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
size_t ks, |
|
const float** restrict a, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
size_t a_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 1); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(ks != 0); |
|
assert(ks % (1 * sizeof(void*)) == 0); |
|
assert(a_offset % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
float* c0 = c; |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
|
|
size_t p = ks; |
|
do { |
|
const float* restrict a0 = a[0]; |
|
assert(a0 != NULL); |
|
if XNN_UNPREDICTABLE(a0 != zero) { |
|
a0 = (const float*) ((uintptr_t) a0 + a_offset); |
|
} |
|
a += 1; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
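        // Main k-loop: four FMA passes per 4-float block, rotating va0 one
        // lane after each pass to meet the next pre-shuffled weight block.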
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w + 0); |
|
const float32x4_t vb4567c0 = vld1q_f32(w + 4); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w + 8); |
|
const float32x4_t vb4567c1 = vld1q_f32(w + 12); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w + 16); |
|
const float32x4_t vb4567c2 = vld1q_f32(w + 20); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w + 24); |
|
const float32x4_t vb4567c3 = vld1q_f32(w + 28); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
|
|
|
|
w += 32; |
|
k -= 4 * sizeof(float); |
|
} |
|
if XNN_UNLIKELY(k != 0) { |
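        // Remainder (k < 4 floats): the 4-wide load may overread past the row,
        // so activation lanes are zeroed (VBIC) wherever the matching weight
        // equals zero; the packed weights are expected to be zero-padded past
        // kc, which keeps garbage lanes out of the accumulators.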
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
|
|
} |
|
|
|
p -= 1 * sizeof(void*); |
|
} while (p != 0); |
|
|
|
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
|
|
    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a = (const float**restrict) ((uintptr_t) a - ks); |
|
nc -= 8; |
|
} else { |
|
if (nc & 4) { |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
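
// 4-row variant of the 1x8s4 IGEMM above.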
|
|
|
void xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
size_t ks, |
|
const float** restrict a, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
size_t a_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 4); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(ks != 0); |
|
assert(ks % (4 * sizeof(void*)) == 0); |
|
assert(a_offset % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
float* c0 = c; |
|
float* c1 = (float*) ((uintptr_t) c0 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 2) { |
|
c1 = c0; |
|
} |
|
float* c2 = (float*) ((uintptr_t) c1 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 2) { |
|
c2 = c1; |
|
} |
|
float* c3 = (float*) ((uintptr_t) c2 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr != 4) { |
|
c3 = c2; |
|
} |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc1x0123 = vacc0x0123; |
|
float32x4_t vacc1x4567 = vacc0x4567; |
|
float32x4_t vacc2x0123 = vacc0x0123; |
|
float32x4_t vacc2x4567 = vacc0x4567; |
|
float32x4_t vacc3x0123 = vacc0x0123; |
|
float32x4_t vacc3x4567 = vacc0x4567; |
|
|
|
size_t p = ks; |
|
do { |
|
const float* restrict a0 = a[0]; |
|
assert(a0 != NULL); |
|
if XNN_UNPREDICTABLE(a0 != zero) { |
|
a0 = (const float*) ((uintptr_t) a0 + a_offset); |
|
} |
|
const float* restrict a1 = a[1]; |
|
assert(a1 != NULL); |
|
if XNN_UNPREDICTABLE(a1 != zero) { |
|
a1 = (const float*) ((uintptr_t) a1 + a_offset); |
|
} |
|
const float* restrict a2 = a[2]; |
|
assert(a2 != NULL); |
|
if XNN_UNPREDICTABLE(a2 != zero) { |
|
a2 = (const float*) ((uintptr_t) a2 + a_offset); |
|
} |
|
const float* restrict a3 = a[3]; |
|
assert(a3 != NULL); |
|
if XNN_UNPREDICTABLE(a3 != zero) { |
|
a3 = (const float*) ((uintptr_t) a3 + a_offset); |
|
} |
|
a += 4; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
float32x4_t va1 = vld1q_f32(a1); a1 += 4; |
|
float32x4_t va2 = vld1q_f32(a2); a2 += 4; |
|
float32x4_t va3 = vld1q_f32(a3); a3 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w + 0); |
|
const float32x4_t vb4567c0 = vld1q_f32(w + 4); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w + 8); |
|
const float32x4_t vb4567c1 = vld1q_f32(w + 12); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w + 16); |
|
const float32x4_t vb4567c2 = vld1q_f32(w + 20); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w + 24); |
|
const float32x4_t vb4567c3 = vld1q_f32(w + 28); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); |
|
|
|
|
|
w += 32; |
|
k -= 4 * sizeof(float); |
|
} |
|
if XNN_UNLIKELY(k != 0) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); |
|
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); |
|
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); |
|
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); |
|
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); |
|
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); |
|
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); |
|
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); |
|
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); |
|
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); |
|
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); |
|
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); |
|
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); |
|
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); |
|
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); |
|
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); |
|
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); |
|
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); |
|
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); |
|
|
|
} |
|
|
|
p -= 4 * sizeof(void*); |
|
} while (p != 0); |
|
|
|
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
vacc2x0123 = vminq_f32(vacc2x0123, vmax); |
|
vacc3x0123 = vminq_f32(vacc3x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
vacc1x4567 = vminq_f32(vacc1x4567, vmax); |
|
vacc2x4567 = vminq_f32(vacc2x4567, vmax); |
|
vacc3x4567 = vminq_f32(vacc3x4567, vmax); |
|
|
|
    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); |
|
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); |
|
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); |
|
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c3, vacc3x0123); |
|
vst1q_f32(c3 + 4, vacc3x4567); |
|
c3 = (float*) ((uintptr_t) c3 + cn_stride); |
|
vst1q_f32(c2, vacc2x0123); |
|
vst1q_f32(c2 + 4, vacc2x4567); |
|
c2 = (float*) ((uintptr_t) c2 + cn_stride); |
|
vst1q_f32(c1, vacc1x0123); |
|
vst1q_f32(c1 + 4, vacc1x4567); |
|
c1 = (float*) ((uintptr_t) c1 + cn_stride); |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a = (const float**restrict) ((uintptr_t) a - ks); |
|
nc -= 8; |
|
} else { |
|
if (nc & 4) { |
|
vst1q_f32(c3, vacc3x0123); c3 += 4; |
|
vst1q_f32(c2, vacc2x0123); c2 += 4; |
|
vst1q_f32(c1, vacc1x0123); c1 += 4; |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc3x0123 = vacc3x4567; |
|
vacc2x0123 = vacc2x4567; |
|
vacc1x0123 = vacc1x4567; |
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); |
|
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); |
|
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c3, vacc3x01); c3 += 2; |
|
vst1_f32(c2, vacc2x01); c2 += 2; |
|
vst1_f32(c1, vacc1x01); c1 += 2; |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc3x01 = vget_high_f32(vacc3x0123); |
|
vacc2x01 = vget_high_f32(vacc2x0123); |
|
vacc1x01 = vget_high_f32(vacc1x0123); |
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c3, vacc3x01, 0); |
|
vst1_lane_f32(c2, vacc2x01, 0); |
|
vst1_lane_f32(c1, vacc1x01, 0); |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
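
// 6-row variant of the s4 IGEMM kernels above.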
|
|
|
void xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma( |
|
size_t mr, |
|
size_t nc, |
|
size_t kc, |
|
size_t ks, |
|
const float** restrict a, |
|
const float* restrict w, |
|
float* restrict c, |
|
size_t cm_stride, |
|
size_t cn_stride, |
|
size_t a_offset, |
|
const float* zero, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(mr != 0); |
|
assert(mr <= 6); |
|
assert(nc != 0); |
|
assert(kc != 0); |
|
assert(kc % sizeof(float) == 0); |
|
assert(ks != 0); |
|
assert(ks % (6 * sizeof(void*)) == 0); |
|
assert(a_offset % sizeof(float) == 0); |
|
assert(a != NULL); |
|
assert(w != NULL); |
|
assert(c != NULL); |
|
|
|
float* c0 = c; |
|
float* c1 = (float*) ((uintptr_t) c0 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 2) { |
|
c1 = c0; |
|
} |
|
float* c2 = (float*) ((uintptr_t) c1 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 2) { |
|
c2 = c1; |
|
} |
|
float* c3 = (float*) ((uintptr_t) c2 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr < 4) { |
|
c3 = c2; |
|
} |
|
float* c4 = (float*) ((uintptr_t) c3 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr <= 4) { |
|
c4 = c3; |
|
} |
|
float* c5 = (float*) ((uintptr_t) c4 + cm_stride); |
|
if XNN_UNPREDICTABLE(mr != 6) { |
|
c5 = c4; |
|
} |
|
|
|
do { |
|
float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; |
|
float32x4_t vacc1x0123 = vacc0x0123; |
|
float32x4_t vacc1x4567 = vacc0x4567; |
|
float32x4_t vacc2x0123 = vacc0x0123; |
|
float32x4_t vacc2x4567 = vacc0x4567; |
|
float32x4_t vacc3x0123 = vacc0x0123; |
|
float32x4_t vacc3x4567 = vacc0x4567; |
|
float32x4_t vacc4x0123 = vacc0x0123; |
|
float32x4_t vacc4x4567 = vacc0x4567; |
|
float32x4_t vacc5x0123 = vacc0x0123; |
|
float32x4_t vacc5x4567 = vacc0x4567; |
|
|
|
size_t p = ks; |
|
do { |
|
const float* restrict a0 = a[0]; |
|
assert(a0 != NULL); |
|
if XNN_UNPREDICTABLE(a0 != zero) { |
|
a0 = (const float*) ((uintptr_t) a0 + a_offset); |
|
} |
|
const float* restrict a1 = a[1]; |
|
assert(a1 != NULL); |
|
if XNN_UNPREDICTABLE(a1 != zero) { |
|
a1 = (const float*) ((uintptr_t) a1 + a_offset); |
|
} |
|
const float* restrict a2 = a[2]; |
|
assert(a2 != NULL); |
|
if XNN_UNPREDICTABLE(a2 != zero) { |
|
a2 = (const float*) ((uintptr_t) a2 + a_offset); |
|
} |
|
const float* restrict a3 = a[3]; |
|
assert(a3 != NULL); |
|
if XNN_UNPREDICTABLE(a3 != zero) { |
|
a3 = (const float*) ((uintptr_t) a3 + a_offset); |
|
} |
|
const float* restrict a4 = a[4]; |
|
assert(a4 != NULL); |
|
if XNN_UNPREDICTABLE(a4 != zero) { |
|
a4 = (const float*) ((uintptr_t) a4 + a_offset); |
|
} |
|
const float* restrict a5 = a[5]; |
|
assert(a5 != NULL); |
|
if XNN_UNPREDICTABLE(a5 != zero) { |
|
a5 = (const float*) ((uintptr_t) a5 + a_offset); |
|
} |
|
a += 6; |
|
|
|
size_t k = kc; |
|
while (k >= 4 * sizeof(float)) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 += 4; |
|
float32x4_t va1 = vld1q_f32(a1); a1 += 4; |
|
float32x4_t va2 = vld1q_f32(a2); a2 += 4; |
|
float32x4_t va3 = vld1q_f32(a3); a3 += 4; |
|
float32x4_t va4 = vld1q_f32(a4); a4 += 4; |
|
float32x4_t va5 = vld1q_f32(a5); a5 += 4; |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w + 0); |
|
const float32x4_t vb4567c0 = vld1q_f32(w + 4); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w + 8); |
|
const float32x4_t vb4567c1 = vld1q_f32(w + 12); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w + 16); |
|
const float32x4_t vb4567c2 = vld1q_f32(w + 20); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w + 24); |
|
const float32x4_t vb4567c3 = vld1q_f32(w + 28); |
|
|
|
vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3); |
|
|
|
|
|
w += 32; |
|
k -= 4 * sizeof(float); |
|
} |
|
if XNN_UNLIKELY(k != 0) { |
|
float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); |
|
float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); |
|
float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); |
|
float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); |
|
float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k); |
|
float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k); |
|
|
|
|
|
const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); |
|
const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); |
|
const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); |
|
const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); |
|
const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0); |
|
const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0); |
|
const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); |
|
const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); |
|
const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); |
|
const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); |
|
const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0); |
|
const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); |
|
const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); |
|
const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); |
|
const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); |
|
const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1); |
|
const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1); |
|
const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); |
|
const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); |
|
const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); |
|
const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); |
|
const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1); |
|
const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); |
|
const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); |
|
const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); |
|
const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); |
|
const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2); |
|
const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2); |
|
const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); |
|
const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); |
|
const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); |
|
const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); |
|
const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2); |
|
const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2); |
|
|
|
va0 = vextq_f32(va0, va0, 1); |
|
va1 = vextq_f32(va1, va1, 1); |
|
va2 = vextq_f32(va2, va2, 1); |
|
va3 = vextq_f32(va3, va3, 1); |
|
va4 = vextq_f32(va4, va4, 1); |
|
va5 = vextq_f32(va5, va5, 1); |
|
|
|
const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; |
|
const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; |
|
|
|
const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); |
|
const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); |
|
const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); |
|
const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); |
|
const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3); |
|
const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); |
|
vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3); |
|
const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); |
|
const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); |
|
const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); |
|
const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); |
|
const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3); |
|
const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); |
|
vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3); |
|
|
|
} |
|
|
|
p -= 6 * sizeof(void*); |
|
} while (p != 0); |
|
|
|
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
vacc2x0123 = vminq_f32(vacc2x0123, vmax); |
|
vacc3x0123 = vminq_f32(vacc3x0123, vmax); |
|
vacc4x0123 = vminq_f32(vacc4x0123, vmax); |
|
vacc5x0123 = vminq_f32(vacc5x0123, vmax); |
|
vacc0x4567 = vminq_f32(vacc0x4567, vmax); |
|
vacc1x4567 = vminq_f32(vacc1x4567, vmax); |
|
vacc2x4567 = vminq_f32(vacc2x4567, vmax); |
|
vacc3x4567 = vminq_f32(vacc3x4567, vmax); |
|
vacc4x4567 = vminq_f32(vacc4x4567, vmax); |
|
vacc5x4567 = vminq_f32(vacc5x4567, vmax); |
|
|
|
    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); |
|
vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); |
|
vacc4x0123 = vmaxq_f32(vacc4x0123, vmin); |
|
vacc5x0123 = vmaxq_f32(vacc5x0123, vmin); |
|
vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); |
|
vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); |
|
vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); |
|
vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); |
|
vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); |
|
vacc5x4567 = vmaxq_f32(vacc5x4567, vmin); |
|
|
|
if XNN_LIKELY(nc >= 8) { |
|
vst1q_f32(c5, vacc5x0123); |
|
vst1q_f32(c5 + 4, vacc5x4567); |
|
c5 = (float*) ((uintptr_t) c5 + cn_stride); |
|
vst1q_f32(c4, vacc4x0123); |
|
vst1q_f32(c4 + 4, vacc4x4567); |
|
c4 = (float*) ((uintptr_t) c4 + cn_stride); |
|
vst1q_f32(c3, vacc3x0123); |
|
vst1q_f32(c3 + 4, vacc3x4567); |
|
c3 = (float*) ((uintptr_t) c3 + cn_stride); |
|
vst1q_f32(c2, vacc2x0123); |
|
vst1q_f32(c2 + 4, vacc2x4567); |
|
c2 = (float*) ((uintptr_t) c2 + cn_stride); |
|
vst1q_f32(c1, vacc1x0123); |
|
vst1q_f32(c1 + 4, vacc1x4567); |
|
c1 = (float*) ((uintptr_t) c1 + cn_stride); |
|
vst1q_f32(c0, vacc0x0123); |
|
vst1q_f32(c0 + 4, vacc0x4567); |
|
c0 = (float*) ((uintptr_t) c0 + cn_stride); |
|
|
|
a = (const float**restrict) ((uintptr_t) a - ks); |
|
nc -= 8; |
|
} else { |
|
if (nc & 4) { |
|
vst1q_f32(c5, vacc5x0123); c5 += 4; |
|
vst1q_f32(c4, vacc4x0123); c4 += 4; |
|
vst1q_f32(c3, vacc3x0123); c3 += 4; |
|
vst1q_f32(c2, vacc2x0123); c2 += 4; |
|
vst1q_f32(c1, vacc1x0123); c1 += 4; |
|
vst1q_f32(c0, vacc0x0123); c0 += 4; |
|
|
|
vacc5x0123 = vacc5x4567; |
|
vacc4x0123 = vacc4x4567; |
|
vacc3x0123 = vacc3x4567; |
|
vacc2x0123 = vacc2x4567; |
|
vacc1x0123 = vacc1x4567; |
|
vacc0x0123 = vacc0x4567; |
|
} |
|
float32x2_t vacc5x01 = vget_low_f32(vacc5x0123); |
|
float32x2_t vacc4x01 = vget_low_f32(vacc4x0123); |
|
float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); |
|
float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); |
|
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); |
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
if (nc & 2) { |
|
vst1_f32(c5, vacc5x01); c5 += 2; |
|
vst1_f32(c4, vacc4x01); c4 += 2; |
|
vst1_f32(c3, vacc3x01); c3 += 2; |
|
vst1_f32(c2, vacc2x01); c2 += 2; |
|
vst1_f32(c1, vacc1x01); c1 += 2; |
|
vst1_f32(c0, vacc0x01); c0 += 2; |
|
|
|
vacc5x01 = vget_high_f32(vacc5x0123); |
|
vacc4x01 = vget_high_f32(vacc4x0123); |
|
vacc3x01 = vget_high_f32(vacc3x0123); |
|
vacc2x01 = vget_high_f32(vacc2x0123); |
|
vacc1x01 = vget_high_f32(vacc1x0123); |
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
} |
|
if (nc & 1) { |
|
vst1_lane_f32(c5, vacc5x01, 0); |
|
vst1_lane_f32(c4, vacc4x01, 0); |
|
vst1_lane_f32(c3, vacc3x01, 0); |
|
vst1_lane_f32(c2, vacc2x01, 0); |
|
vst1_lane_f32(c1, vacc1x01, 0); |
|
vst1_lane_f32(c0, vacc0x01, 0); |
|
} |
|
|
|
nc = 0; |
|
} |
|
} while (nc != 0); |
|
} |
|
|
|
extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; |
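
// Computes f[i] = exp(input[i] - *max) for every element, stores the results,
// and accumulates their sum (the softmax normalizer). exp() is evaluated as
// 2**(n/64) * p(t): n = round(64*x/ln2) (magic-bias rounding), 2**(k/64) comes
// from the 64-entry table above, and p is a degree-2 polynomial in the residual.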
|
|
|
void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16( |
|
size_t batch, |
|
const float* input, |
|
const float* max, |
|
float* output, |
|
float* sum, |
|
const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(batch != 0); |
|
assert(batch % sizeof(float) == 0); |
|
assert(input != NULL); |
|
assert(max != NULL); |
|
assert(output != NULL); |
|
assert(sum != NULL); |
|
|
|
const float32x4_t vi_max = vld1q_dup_f32(max); |
|
  const float32x4_t vlog2e = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.log2e);
|
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.magic_bias);
|
const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); |
|
  const float32x4_t vminus_ln2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.minus_ln2);
|
  const float32x4_t vc2 = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.c2);
|
  const float32x4_t vdenorm_cutoff = vld1q_dup_f32(&params->neonfma_rr1_lut64_p2.denorm_cutoff);
|
|
|
float32x4_t vacc0 = vmovq_n_f32(0.0f); |
|
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { |
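    // Main loop: 16 elements per iteration, processed as four 4-lane vectors.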
|
const float32x4_t vi0123 = vld1q_f32(input); input += 4; |
|
const float32x4_t vi4567 = vld1q_f32(input); input += 4; |
|
const float32x4_t vi89AB = vld1q_f32(input); input += 4; |
|
const float32x4_t viCDEF = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); |
|
const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); |
|
const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); |
|
const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); |
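
    // n := round(64 * x / ln2), captured with the magic-bias trick: adding the
    // bias forces the scaled value's integer part into the low mantissa bits.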
|
|
|
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); |
|
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); |
|
float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); |
|
float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); |
|
|
|
const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); |
|
const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); |
|
const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); |
|
const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); |
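
    // The upper bits of n (n >> 6), shifted into the float exponent field, and
    // the low 6 bits, used as a table index, together reconstruct s = 2**(n/64).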
|
|
|
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); |
|
const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); |
|
const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); |
|
const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); |
|
const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); |
|
const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); |
|
const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); |
|
const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); |
|
const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); |
|
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); |
|
const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); |
|
const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); |
|
|
|
float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); |
|
float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); |
|
float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); |
|
float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); |
|
float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); |
|
float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); |
|
float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); |
|
float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); |
|
|
|
vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); |
|
vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); |
|
const float32x4_t vl0123 = vcombine_f32(vl01, vl23); |
|
vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); |
|
vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); |
|
const float32x4_t vl4567 = vcombine_f32(vl45, vl67); |
|
vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); |
|
vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); |
|
const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); |
|
vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); |
|
vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); |
|
const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); |
|
|
|
const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); |
|
const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); |
|
const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); |
|
const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); |
|
|
|
vn0123 = vsubq_f32(vn0123, vmagic_bias); |
|
vn4567 = vsubq_f32(vn4567, vmagic_bias); |
|
vn89AB = vsubq_f32(vn89AB, vmagic_bias); |
|
vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); |
|
|
|
float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); |
|
float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); |
|
float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); |
|
float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); |
|
|
|
float32x4_t vp0123 = vmulq_f32(vt0123, vc2); |
|
float32x4_t vp4567 = vmulq_f32(vt4567, vc2); |
|
float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); |
|
float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); |
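
    // p(t) = t + c2*t**2, then f = s + s*p = s * (1 + p).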
|
|
|
vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); |
|
vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); |
|
vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); |
|
vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); |
|
|
|
float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); |
|
float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); |
|
float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); |
|
float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); |
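
    // Inputs below the denormal cutoff would produce denormal outputs; force
    // those results to zero instead.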
|
|
|
vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); |
|
vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); |
|
vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); |
|
vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); |
|
|
|
vst1q_f32(output, vf0123); output += 4; |
|
vst1q_f32(output, vf4567); output += 4; |
|
vst1q_f32(output, vf89AB); output += 4; |
|
vst1q_f32(output, vfCDEF); output += 4; |
|
|
|
vacc0 = vaddq_f32(vacc0, vf0123); |
|
vacc0 = vaddq_f32(vacc0, vf4567); |
|
vacc0 = vaddq_f32(vacc0, vf89AB); |
|
vacc0 = vaddq_f32(vacc0, vfCDEF); |
|
} |
|
|
|
float32x4_t vacc = vacc0; |
|
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { |
|
const float32x4_t vi = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vx = vsubq_f32(vi, vi_max); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); |
|
|
|
const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); |
|
|
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); |
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); |
|
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); |
|
vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); |
|
vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); |
|
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); |
|
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); |
|
|
|
float32x4_t vp = vmulq_f32(vt, vc2); |
|
vp = vfmaq_f32(vt, vt, vp); |
|
|
|
float32x4_t vf = vfmaq_f32(vs, vs, vp); |
|
|
|
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); |
|
|
|
vst1q_f32(output, vf); output += 4; |
|
|
|
vacc = vaddq_f32(vacc, vf); |
|
} |
|
#if XNN_ARCH_ARM64 |
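  // AArch64 has a single-instruction horizontal add; AArch32 reduces pairwise.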
|
float vacc_lo = vaddvq_f32(vacc); |
|
#else |
|
float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); |
|
#endif |
|
if (batch != 0) { |
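    // 1-3 elements remain: full 4-wide load (overread permitted), then store
    // and accumulate only the valid lanes.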
|
assert(batch >= 1 * sizeof(float)); |
|
assert(batch <= 3 * sizeof(float)); |
|
const float32x4_t vi = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vx = vsubq_f32(vi, vi_max); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); |
|
|
|
const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); |
|
|
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); |
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); |
|
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); |
|
vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); |
|
vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); |
|
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); |
|
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); |
|
|
|
float32x4_t vp = vmulq_f32(vt, vc2); |
|
vp = vfmaq_f32(vt, vt, vp); |
|
|
|
float32x4_t vf = vfmaq_f32(vs, vs, vp); |
|
|
|
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); |
|
|
|
float32x2_t vf_lo = vget_low_f32(vf); |
|
if (batch & (2 * sizeof(float))) { |
|
vst1_f32(output, vf_lo); output += 2; |
|
|
|
#if XNN_ARCH_ARM64 |
|
vacc_lo += vaddv_f32(vf_lo); |
|
#else |
|
vacc_lo = vadd_f32(vacc_lo, vf_lo); |
|
#endif |
|
|
|
vf_lo = vget_high_f32(vf); |
|
} |
|
if (batch & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vf_lo, 0); |
|
|
|
#if XNN_ARCH_ARM64 |
|
vacc_lo += vget_lane_f32(vf_lo, 0); |
|
#else |
|
vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); |
|
#endif |
|
} |
|
} |
|
#if XNN_ARCH_ARM64 |
|
*sum = vacc_lo; |
|
#else |
|
vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); |
|
#endif |
|
} |
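
// Sparse-times-dense matrix product: widx_dmap holds the byte offsets between
// consecutive nonzero input columns and nidx_nnzmap the nonzero count per
// output channel. "Pipelined": the next weight and input vectors are loaded
// before the current FMAs complete.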
|
|
|
void xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined( |
|
size_t mc, |
|
size_t nc, |
|
const float* input, |
|
const float* weights, |
|
const int32_t* widx_dmap, |
|
const uint32_t* nidx_nnzmap, |
|
float* output, |
|
size_t output_stride, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) |
|
{ |
|
assert(mc != 0); |
|
assert(mc % sizeof(float) == 0); |
|
assert(nc != 0); |
|
|
|
#if XNN_ARCH_ARM64 |
|
    const float32x4x2_t vminmax = vld2q_dup_f32(&params->scalar.min);
|
const float32x4_t vmin = vminmax.val[0]; |
|
const float32x4_t vmax = vminmax.val[1]; |
|
#else |
|
    const float32x2x2_t vminmax = vld2_dup_f32(&params->scalar.min);
|
const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]); |
|
const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]); |
|
#endif |
|
|
|
size_t output_decrement = output_stride * nc - 32 * sizeof(float); |
|
while XNN_LIKELY(mc >= 32 * sizeof(float)) { |
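    // Main loop: 32 M-elements per pass across all nc output channels.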
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
float32x4_t vw = vld1q_dup_f32(w); w += 1; |
|
intptr_t diff = *dmap++; |
|
float32x4_t vi0123 = vld1q_f32(input); |
|
float32x4_t vi4567 = vld1q_f32(input + 4); |
|
float32x4_t vi89AB = vld1q_f32(input + 8); |
|
float32x4_t viCDEF = vld1q_f32(input + 12); |
|
float32x4_t viGHIJ = vld1q_f32(input + 16); |
|
float32x4_t viKLMN = vld1q_f32(input + 20); |
|
float32x4_t viOPQR = vld1q_f32(input + 24); |
|
float32x4_t viSTUV = vld1q_f32(input + 28); |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x4_t vacc0123 = vw; |
|
float32x4_t vacc4567 = vw; |
|
float32x4_t vacc89AB = vw; |
|
float32x4_t vaccCDEF = vw; |
|
float32x4_t vaccGHIJ = vw; |
|
float32x4_t vaccKLMN = vw; |
|
float32x4_t vaccOPQR = vw; |
|
float32x4_t vaccSTUV = vw; |
|
vw = vld1q_dup_f32(w); w += 1; |
|
if XNN_LIKELY(nnz != 0) { |
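        // Software pipelining: each iteration's FMAs consume the vectors
        // loaded on the previous iteration while the next loads are issued.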
|
do { |
|
vacc0123 = vfmaq_f32(vacc0123, vi0123, vw); |
|
vacc4567 = vfmaq_f32(vacc4567, vi4567, vw); |
|
vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw); |
|
vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw); |
|
vaccGHIJ = vfmaq_f32(vaccGHIJ, viGHIJ, vw); |
|
vaccKLMN = vfmaq_f32(vaccKLMN, viKLMN, vw); |
|
vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw); |
|
vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
diff = *dmap++; |
|
vw = vld1q_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vi0123 = vld1q_f32(input); |
|
vi4567 = vld1q_f32(input + 4); |
|
vi89AB = vld1q_f32(input + 8); |
|
viCDEF = vld1q_f32(input + 12); |
|
viGHIJ = vld1q_f32(input + 16); |
|
viKLMN = vld1q_f32(input + 20); |
|
viOPQR = vld1q_f32(input + 24); |
|
viSTUV = vld1q_f32(input + 28); |
|
} while (--nnz != 0); |
|
} |
|
float32x4_t vout0123 = vminq_f32(vacc0123, vmax); |
|
float32x4_t vout4567 = vminq_f32(vacc4567, vmax); |
|
float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); |
|
float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); |
|
float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax); |
|
float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax); |
|
float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax); |
|
float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax); |
|
vout0123 = vmaxq_f32(vout0123, vmin); |
|
vout4567 = vmaxq_f32(vout4567, vmin); |
|
vout89AB = vmaxq_f32(vout89AB, vmin); |
|
voutCDEF = vmaxq_f32(voutCDEF, vmin); |
|
voutGHIJ = vmaxq_f32(voutGHIJ, vmin); |
|
voutKLMN = vmaxq_f32(voutKLMN, vmin); |
|
voutOPQR = vmaxq_f32(voutOPQR, vmin); |
|
voutSTUV = vmaxq_f32(voutSTUV, vmin); |
|
vst1q_f32(output, vout0123); |
|
vst1q_f32(output + 4, vout4567); |
|
vst1q_f32(output + 8, vout89AB); |
|
vst1q_f32(output + 12, voutCDEF); |
|
vst1q_f32(output + 16, voutGHIJ); |
|
vst1q_f32(output + 20, voutKLMN); |
|
vst1q_f32(output + 24, voutOPQR); |
|
vst1q_f32(output + 28, voutSTUV); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 32; |
|
mc -= 32 * sizeof(float); |
|
} |
|
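// Remainder: a leftover mc of fewer than 32 floats is handled in
// power-of-two slices (16/8/4/2/1), re-walking the same sparse structure
// for each slice.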
if XNN_UNLIKELY(mc != 0) { |
|
output_decrement += 16 * sizeof(float); |
|
if (mc & (16 * sizeof(float))) { |
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; |
|
float32x4_t vacc4567 = vacc0123; |
|
float32x4_t vacc89AB = vacc0123; |
|
float32x4_t vaccCDEF = vacc0123; |
|
if XNN_LIKELY(nnz != 0) { |
|
do { |
|
const intptr_t diff = *dmap++; |
|
const float32x4_t vi0123 = vld1q_f32(input); |
|
const float32x4_t vi4567 = vld1q_f32(input + 4); |
|
const float32x4_t vi89AB = vld1q_f32(input + 8); |
|
const float32x4_t viCDEF = vld1q_f32(input + 12); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
const float32x4_t vb = vld1q_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); |
|
vacc4567 = vfmaq_f32(vacc4567, vi4567, vb); |
|
vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vb); |
|
vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vb); |
|
} while (--nnz != 0); |
|
} |
|
float32x4_t vout0123 = vminq_f32(vacc0123, vmax); |
|
float32x4_t vout4567 = vminq_f32(vacc4567, vmax); |
|
float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); |
|
float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); |
|
vout0123 = vmaxq_f32(vout0123, vmin); |
|
vout4567 = vmaxq_f32(vout4567, vmin); |
|
vout89AB = vmaxq_f32(vout89AB, vmin); |
|
voutCDEF = vmaxq_f32(voutCDEF, vmin); |
|
vst1q_f32(output, vout0123); |
|
vst1q_f32(output + 4, vout4567); |
|
vst1q_f32(output + 8, vout89AB); |
|
vst1q_f32(output + 12, voutCDEF); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 16; |
|
} |
|
output_decrement += 8 * sizeof(float); |
|
if (mc & (8 * sizeof(float))) { |
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; |
|
float32x4_t vacc4567 = vacc0123; |
|
if XNN_LIKELY(nnz != 0) { |
|
do { |
|
const intptr_t diff = *dmap++; |
|
const float32x4_t vi0123 = vld1q_f32(input); |
|
const float32x4_t vi4567 = vld1q_f32(input + 4); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
const float32x4_t vb = vld1q_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); |
|
vacc4567 = vfmaq_f32(vacc4567, vi4567, vb); |
|
} while (--nnz != 0); |
|
} |
|
float32x4_t vout0123 = vminq_f32(vacc0123, vmax); |
|
float32x4_t vout4567 = vminq_f32(vacc4567, vmax); |
|
vout0123 = vmaxq_f32(vout0123, vmin); |
|
vout4567 = vmaxq_f32(vout4567, vmin); |
|
vst1q_f32(output, vout0123); |
|
vst1q_f32(output + 4, vout4567); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 8; |
|
} |
|
output_decrement += 4 * sizeof(float); |
|
if (mc & (4 * sizeof(float))) { |
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; |
|
if XNN_LIKELY(nnz != 0) { |
|
do { |
|
const intptr_t diff = *dmap++; |
|
const float32x4_t vi0123 = vld1q_f32(input); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
const float32x4_t vb = vld1q_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); |
|
} while (--nnz != 0); |
|
} |
|
float32x4_t vout0123 = vminq_f32(vacc0123, vmax); |
|
vout0123 = vmaxq_f32(vout0123, vmin); |
|
vst1q_f32(output, vout0123); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 4; |
|
} |
|
output_decrement += 2 * sizeof(float); |
|
if (mc & (2 * sizeof(float))) { |
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x2_t vacc01 = vld1_dup_f32(w); w += 1; |
|
if XNN_LIKELY(nnz != 0) { |
|
do { |
|
const intptr_t diff = *dmap++; |
|
const float32x2_t vi01 = vld1_f32(input); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
const float32x2_t vb = vld1_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vacc01 = vfma_f32(vacc01, vi01, vb); |
|
} while (--nnz != 0); |
|
} |
|
float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); |
|
vout01 = vmax_f32(vout01, vget_low_f32(vmin)); |
|
vst1_f32(output, vout01); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 2; |
|
} |
|
output_decrement += 1 * sizeof(float); |
|
if (mc & (1 * sizeof(float))) { |
|
const float* w = weights; |
|
const int32_t* dmap = widx_dmap; |
|
const uint32_t* nnzmap = nidx_nnzmap; |
|
size_t n = nc; |
|
do { |
|
uint32_t nnz = *nnzmap++; |
|
float32x2_t vacc0 = vld1_dup_f32(w); w += 1; |
|
if XNN_LIKELY(nnz != 0) { |
|
do { |
|
const intptr_t diff = *dmap++; |
|
const float32x2_t vi0 = vld1_dup_f32(input); |
|
input = (const float*) ((uintptr_t) input + (uintptr_t) diff); |
|
xnn_prefetch_to_l1(input + 16); |
|
xnn_prefetch_to_l1(input + 32); |
|
const float32x2_t vb = vld1_dup_f32(w); w += 1; |
|
xnn_prefetch_to_l1(w + 32); |
|
vacc0 = vfma_f32(vacc0, vi0, vb); |
|
} while (--nnz != 0); |
|
} |
|
float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax)); |
|
vout0 = vmax_f32(vout0, vget_low_f32(vmin)); |
|
vst1_lane_f32(output, vout0, 0); |
|
output = (float*) ((uintptr_t) output + output_stride); |
|
} while (--n != 0); |
|
output = (float*) ((uintptr_t) output - output_decrement); |
|
input += 1; |
|
} |
|
} |
|
} |
|
|
|
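// 16-entry table of 2**(-k/16), stored as IEEE-754 bit patterns (int32_t) so
// that integer addition can splice exponent bits into the entries when
// reconstructing 2**n below.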
extern XNN_INTERNAL const int32_t xnn_table_exp2minus_k_over_16[16]; |
|
|
|
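// ELU kernel, 16 elements per iteration: for x < 0 the result is
// alpha * (exp(prescale * x) - 1), otherwise beta * x. exp() is evaluated
// with round-to-nearest via the magic-bias trick, the 16-entry exp2 table
// for the fractional part, and a degree-3 polynomial ("rr1_lut16_p3").
//
// A scalar reference of the intended math (illustrative only; the helper
// name is invented, and expm1f from <math.h> stands in for the vectorized
// approximation):
static inline float xnn_elu_scalar_reference(float x, float prescale, float alpha, float beta) {
  return x < 0.0f ? alpha * expm1f(x * prescale) : beta * x;
}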
void xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16( |
|
size_t batch, |
|
const float* input, |
|
float* output, |
|
const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(batch != 0); |
|
assert(batch % sizeof(float) == 0); |
|
assert(input != NULL); |
|
assert(output != NULL); |
|
|
|
const float32x4_t vprescale = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.prescale); |
|
const float32x4_t valpha = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.alpha); |
|
const float32x4_t vbeta = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.beta); |
|
const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.sat_cutoff); |
|
const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.magic_bias); |
|
const float32x4_t vlog2e = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.log2e); |
|
const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0xF));

|
const float32x4_t vminus_ln2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.minus_ln2); |
|
const float32x4_t vc3 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.c3); |
|
const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.c2); |
|
const float32x4_t vone = vmovq_n_f32(1.0f); |
|
|
|
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { |
|
float32x4_t vx0123 = vld1q_f32(input); input += 4; |
|
float32x4_t vx4567 = vld1q_f32(input); input += 4; |
|
float32x4_t vx89AB = vld1q_f32(input); input += 4; |
|
float32x4_t vxCDEF = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); |
|
const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); |
|
const float32x4_t vz89AB = vmaxq_f32(vmulq_f32(vx89AB, vprescale), vsat_cutoff); |
|
const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vlog2e); |
|
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vlog2e); |
|
float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vlog2e); |
|
float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vlog2e); |
|
|
|
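// After the magic-bias add, n still carries the bias, and its low mantissa
// bits encode the integer n = round(z * log2e * 16) / 16: bits 0-3 select
// the exp2 table entry (shifted left by 2 into a byte offset), while the
// bits above them, shifted left by 19, land in the float32 exponent field
// and are added to the table entry later to reconstruct s = 2**n.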
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 2)); |
|
const int32x4_t ven0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 19); |
|
const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 2)); |
|
const int32x4_t ven4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 19); |
|
const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 2)); |
|
const int32x4_t ven89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 19); |
|
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask), 2)); |
|
const int32x4_t venCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 19); |
|
|
|
const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); |
|
const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); |
|
int32x2_t vl01 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01)); |
|
int32x2_t vl23 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23)); |
|
vl01 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)), vl01, 1); |
|
vl23 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)), vl23, 1); |
|
const int32x4_t vl0123 = vcombine_s32(vl01, vl23); |
|
const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); |
|
const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); |
|
int32x2_t vl45 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45)); |
|
int32x2_t vl67 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67)); |
|
vl45 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)), vl45, 1); |
|
vl67 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)), vl67, 1); |
|
const int32x4_t vl4567 = vcombine_s32(vl45, vl67); |
|
const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); |
|
const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); |
|
int32x2_t vl89 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx89)); |
|
int32x2_t vlAB = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxAB)); |
|
vl89 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx89 >> 32)), vl89, 1); |
|
vlAB = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxAB >> 32)), vlAB, 1); |
|
const int32x4_t vl89AB = vcombine_s32(vl89, vlAB); |
|
const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); |
|
const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); |
|
int32x2_t vlCD = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxCD)); |
|
int32x2_t vlEF = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxEF)); |
|
vlCD = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxCD >> 32)), vlCD, 1); |
|
vlEF = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxEF >> 32)), vlEF, 1); |
|
const int32x4_t vlCDEF = vcombine_s32(vlCD, vlEF); |
|
|
|
vn0123 = vsubq_f32(vn0123, vmagic_bias); |
|
float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vl0123, ven0123)); |
|
vn4567 = vsubq_f32(vn4567, vmagic_bias); |
|
float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vl4567, ven4567)); |
|
vn89AB = vsubq_f32(vn89AB, vmagic_bias); |
|
float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vl89AB, ven89AB)); |
|
vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); |
|
float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vlCDEF, venCDEF)); |
|
|
|
float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vminus_ln2); |
|
float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vminus_ln2); |
|
float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vminus_ln2); |
|
float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vminus_ln2); |
|
|
|
float32x4_t vp0123 = vfmaq_f32(vc2, vc3, vt0123); |
|
float32x4_t vp4567 = vfmaq_f32(vc2, vc3, vt4567); |
|
float32x4_t vp89AB = vfmaq_f32(vc2, vc3, vt89AB); |
|
float32x4_t vpCDEF = vfmaq_f32(vc2, vc3, vtCDEF); |
|
|
|
vp0123 = vmulq_f32(vp0123, vt0123); |
|
vp4567 = vmulq_f32(vp4567, vt4567); |
|
vp89AB = vmulq_f32(vp89AB, vt89AB); |
|
vpCDEF = vmulq_f32(vpCDEF, vtCDEF); |
|
|
|
vt0123 = vmulq_f32(vt0123, vs0123); |
|
vs0123 = vsubq_f32(vs0123, vone); |
|
vt4567 = vmulq_f32(vt4567, vs4567); |
|
vs4567 = vsubq_f32(vs4567, vone); |
|
vt89AB = vmulq_f32(vt89AB, vs89AB); |
|
vs89AB = vsubq_f32(vs89AB, vone); |
|
vtCDEF = vmulq_f32(vtCDEF, vsCDEF); |
|
vsCDEF = vsubq_f32(vsCDEF, vone); |
|
|
|
vp0123 = vfmaq_f32(vt0123, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vt4567, vp4567, vt4567); |
|
vp89AB = vfmaq_f32(vt89AB, vp89AB, vt89AB); |
|
vpCDEF = vfmaq_f32(vtCDEF, vpCDEF, vtCDEF); |
|
|
|
const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); |
|
const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); |
|
const float32x4_t ve89AB = vmulq_f32(vaddq_f32(vp89AB, vs89AB), valpha); |
|
const float32x4_t veCDEF = vmulq_f32(vaddq_f32(vpCDEF, vsCDEF), valpha); |
|
|
|
const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); |
|
vx0123 = vmulq_f32(vx0123, vbeta); |
|
const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); |
|
vx4567 = vmulq_f32(vx4567, vbeta); |
|
const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f)); |
|
vx89AB = vmulq_f32(vx89AB, vbeta); |
|
const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f)); |
|
vxCDEF = vmulq_f32(vxCDEF, vbeta); |
|
|
|
const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); |
|
const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); |
|
const float32x4_t vy89AB = vbslq_f32(vm89AB, ve89AB, vx89AB); |
|
const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF); |
|
|
|
vst1q_f32(output, vy0123); output += 4; |
|
vst1q_f32(output, vy4567); output += 4; |
|
vst1q_f32(output, vy89AB); output += 4; |
|
vst1q_f32(output, vyCDEF); output += 4; |
|
} |
|
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { |
|
float32x4_t vx = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); |
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); |
|
const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); |
|
|
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); |
|
int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); |
|
vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); |
|
vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); |
|
|
|
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); |
|
float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); |
|
|
|
float32x4_t vp = vfmaq_f32(vc2, vc3, vt); |
|
vp = vmulq_f32(vp, vt); |
|
|
|
vt = vmulq_f32(vt, vs); |
|
vs = vsubq_f32(vs, vone); |
|
vp = vfmaq_f32(vt, vp, vt); |
|
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); |
|
|
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vx = vmulq_f32(vx, vbeta); |
|
const float32x4_t vy = vbslq_f32(vm, ve, vx); |
|
|
|
vst1q_f32(output, vy); output += 4; |
|
} |
|
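// Tail of 1-3 elements: a full vector is loaded (the kernel is declared
// XNN_OOB_READS, so reading past the end of the input is permitted) and
// only the batch&2 / batch&1 lanes are stored.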
if XNN_UNLIKELY(batch != 0) { |
|
float32x4_t vx = vld1q_f32(input); |
|
|
|
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); |
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); |
|
const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); |
|
|
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); |
|
int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); |
|
vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); |
|
vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); |
|
|
|
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); |
|
float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); |
|
|
|
float32x4_t vp = vfmaq_f32(vc2, vc3, vt); |
|
vp = vmulq_f32(vp, vt); |
|
|
|
vt = vmulq_f32(vt, vs); |
|
vs = vsubq_f32(vs, vone); |
|
vp = vfmaq_f32(vt, vp, vt); |
|
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); |
|
|
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vx = vmulq_f32(vx, vbeta); |
|
const float32x4_t vy = vbslq_f32(vm, ve, vx); |
|
|
|
float32x2_t vy_lo = vget_low_f32(vy); |
|
if (batch & (2 * sizeof(float))) { |
|
vst1_f32(output, vy_lo); output += 2; |
|
vy_lo = vget_high_f32(vy); |
|
} |
|
if (batch & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vy_lo, 0); |
|
} |
|
} |
|
} |
|
|
|
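// Same ELU computation as above, 8 elements per iteration, but without a
// lookup table: exp() uses a degree-6 polynomial ("rr1_p6"), and s = 2**n
// is rebuilt by shifting the biased n directly into the float32 exponent
// field (shift by 23).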
void xnn_f32_velu_ukernel__neonfma_rr1_p6_x8( |
|
size_t batch, |
|
const float* input, |
|
float* output, |
|
const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(batch != 0); |
|
assert(batch % sizeof(float) == 0); |
|
assert(input != NULL); |
|
assert(output != NULL); |
|
|
|
const float32x4_t vprescale = vld1q_dup_f32(¶ms->neonfma_rr1_p6.prescale); |
|
const float32x4_t valpha = vld1q_dup_f32(¶ms->neonfma_rr1_p6.alpha); |
|
const float32x4_t vbeta = vld1q_dup_f32(¶ms->neonfma_rr1_p6.beta); |
|
const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_p6.sat_cutoff); |
|
const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_p6.magic_bias); |
|
const float32x4_t vlog2e = vld1q_dup_f32(¶ms->neonfma_rr1_p6.log2e); |
|
const float32x4_t vminus_ln2 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.minus_ln2); |
|
const float32x4_t vc6 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c6); |
|
const float32x4_t vc5 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c5); |
|
const float32x4_t vc4 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c4); |
|
const float32x4_t vc3 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c3); |
|
const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c2); |
|
const float32x4_t vone = vmovq_n_f32(1.0f); |
|
|
|
for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { |
|
float32x4_t vx0123 = vld1q_f32(input); input += 4; |
|
float32x4_t vx4567 = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); |
|
const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vlog2e); |
|
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vlog2e); |
|
|
|
float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); |
|
vn0123 = vsubq_f32(vn0123, vmagic_bias); |
|
float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); |
|
vn4567 = vsubq_f32(vn4567, vmagic_bias); |
|
|
|
float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vminus_ln2); |
|
float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vminus_ln2); |
|
|
|
float32x4_t vp0123 = vfmaq_f32(vc5, vc6, vt0123); |
|
float32x4_t vp4567 = vfmaq_f32(vc5, vc6, vt4567); |
|
|
|
vp0123 = vfmaq_f32(vc4, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vc4, vp4567, vt4567); |
|
|
|
vp0123 = vfmaq_f32(vc3, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vc3, vp4567, vt4567); |
|
|
|
vp0123 = vfmaq_f32(vc2, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vc2, vp4567, vt4567); |
|
|
|
vp0123 = vmulq_f32(vp0123, vt0123); |
|
vp4567 = vmulq_f32(vp4567, vt4567); |
|
|
|
vt0123 = vmulq_f32(vt0123, vs0123); |
|
vs0123 = vsubq_f32(vs0123, vone); |
|
vt4567 = vmulq_f32(vt4567, vs4567); |
|
vs4567 = vsubq_f32(vs4567, vone); |
|
|
|
vp0123 = vfmaq_f32(vt0123, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vt4567, vp4567, vt4567); |
|
|
|
const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); |
|
const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); |
|
|
|
const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); |
|
vx0123 = vmulq_f32(vx0123, vbeta); |
|
const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); |
|
vx4567 = vmulq_f32(vx4567, vbeta); |
|
|
|
const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); |
|
const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); |
|
|
|
vst1q_f32(output, vy0123); output += 4; |
|
vst1q_f32(output, vy4567); output += 4; |
|
} |
|
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { |
|
float32x4_t vx = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); |
|
float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); |
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); |
|
|
|
float32x4_t vp = vfmaq_f32(vc5, vc6, vt); |
|
vp = vfmaq_f32(vc4, vp, vt); |
|
vp = vfmaq_f32(vc3, vp, vt); |
|
vp = vfmaq_f32(vc2, vp, vt); |
|
vp = vmulq_f32(vp, vt); |
|
|
|
vt = vmulq_f32(vt, vs); |
|
vs = vsubq_f32(vs, vone); |
|
vp = vfmaq_f32(vt, vp, vt); |
|
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); |
|
|
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vx = vmulq_f32(vx, vbeta); |
|
const float32x4_t vy = vbslq_f32(vm, ve, vx); |
|
|
|
vst1q_f32(output, vy); output += 4; |
|
} |
|
if XNN_UNLIKELY(batch != 0) { |
|
float32x4_t vx = vld1q_f32(input); |
|
|
|
const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); |
|
float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); |
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); |
|
|
|
float32x4_t vp = vfmaq_f32(vc5, vc6, vt); |
|
vp = vfmaq_f32(vc4, vp, vt); |
|
vp = vfmaq_f32(vc3, vp, vt); |
|
vp = vfmaq_f32(vc2, vp, vt); |
|
vp = vmulq_f32(vp, vt); |
|
|
|
vt = vmulq_f32(vt, vs); |
|
vs = vsubq_f32(vs, vone); |
|
vp = vfmaq_f32(vt, vp, vt); |
|
const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); |
|
|
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vx = vmulq_f32(vx, vbeta); |
|
const float32x4_t vy = vbslq_f32(vm, ve, vx); |
|
|
|
float32x2_t vy_lo = vget_low_f32(vy); |
|
if (batch & (2 * sizeof(float))) { |
|
vst1_f32(output, vy_lo); output += 2; |
|
vy_lo = vget_high_f32(vy); |
|
} |
|
if (batch & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vy_lo, 0); |
|
} |
|
} |
|
} |
|
|
|
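// Fused multiply-add-clamp over channels, two rows per pass:
// y[r][c] = clamp(x[r][c] * scale[c] + bias[c], min, max), with the weights
// laid out as groups of 4 scales followed by the matching 4 biases. An odd
// final row is handled by aliasing the second row's pointers onto the first.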
void xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x( |
|
size_t rows, |
|
size_t channels, |
|
const float* restrict input, |
|
size_t input_stride, |
|
const float* restrict weights, |
|
float* restrict output, |
|
size_t output_stride, |
|
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(rows != 0); |
|
assert(channels != 0); |
|
assert(channels % sizeof(float) == 0); |
|
|
|
const float* i0 = input; |
|
float* o0 = output; |
|
const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); |
|
float* o1 = (float*) ((uintptr_t) o0 + output_stride); |
|
|
|
const size_t input_increment = input_stride * 2 - channels; |
|
const size_t output_increment = output_stride * 2 - channels; |
|
|
|
const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); |
|
const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); |
|
do { |
|
if XNN_UNPREDICTABLE(rows < 2) { |
|
i1 = i0; |
|
o1 = o0; |
|
} |
|
|
|
const float* w = weights; |
|
size_t c = channels; |
|
for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { |
|
const float32x4_t vscale0123 = vld1q_f32(w); w += 4; |
|
|
|
float32x4_t vacc0x0123 = vld1q_f32(i0); i0 += 4; |
|
float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4; |
|
|
|
|
|
const float32x4_t vbias0123 = vld1q_f32(w); w += 4; |
|
|
|
vacc0x0123 = vfmaq_f32(vbias0123, vscale0123, vacc0x0123); |
|
vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123); |
|
|
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
|
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
|
|
vst1q_f32(o0, vacc0x0123); o0 += 4; |
|
vst1q_f32(o1, vacc1x0123); o1 += 4; |
|
} |
|
if XNN_UNLIKELY(c != 0) { |
|
const float32x4_t vscale0123 = vld1q_f32(w); |
|
|
|
float32x4_t vacc0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); |
|
float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); |
|
|
|
|
|
const float32x4_t vbias0123 = vld1q_f32(w + 4); |
|
|
|
vacc0x0123 = vfmaq_f32(vbias0123, vscale0123, vacc0x0123); |
|
vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123); |
|
|
|
vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); |
|
vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); |
|
|
|
vacc0x0123 = vminq_f32(vacc0x0123, vmax); |
|
vacc1x0123 = vminq_f32(vacc1x0123, vmax); |
|
|
|
float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); |
|
float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); |
|
if (c & (2 * sizeof(float))) { |
|
vst1_f32(o0, vacc0x01); o0 += 2; |
|
vst1_f32(o1, vacc1x01); o1 += 2; |
|
|
|
vacc0x01 = vget_high_f32(vacc0x0123); |
|
vacc1x01 = vget_high_f32(vacc1x0123); |
|
} |
|
if (c & (1 * sizeof(float))) { |
|
vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; |
|
vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; |
|
} |
|
} |
|
i0 = (const float*) ((uintptr_t) i0 + input_increment); |
|
o0 = (float*) ((uintptr_t) o0 + output_increment); |
|
i1 = (const float*) ((uintptr_t) i1 + input_increment); |
|
o1 = (float*) ((uintptr_t) o1 + output_increment); |
|
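// doz() is "difference or zero" (a saturating subtract) from xnnpack/math.h,
// so a single trailing row cannot underflow the counter.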
rows = doz(rows, 2); |
|
} while (rows != 0); |
|
} |
|
|
|
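// 64-entry table of 2**(-k/64) (stored as float) for the sigmoid kernel
// below. The kernel evaluates sigmoid(x) = 1/(1 + exp(-x)) in abs/select
// form: with e = exp(-|x|), it computes f = e / (1 + e) and picks f or
// 1 - f by the sign of x; for |x| beyond denorm_cutoff, f is flushed to 0.
//
// Scalar reference of that scheme (illustrative only; the helper name is
// invented, and expf from <math.h> replaces the table plus polynomial
// approximation):
static inline float xnn_sigmoid_scalar_reference(float x) {
  const float e = expf(-fabsf(x));
  const float f = e / (1.0f + e);
  return x < 0.0f ? f : 1.0f - f;
}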
extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64]; |
|
|
|
void xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16( |
|
size_t batch, |
|
const float* input, |
|
float* output, |
|
const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
assert(batch != 0); |
|
assert(batch % sizeof(float) == 0); |
|
assert(input != NULL); |
|
assert(output != NULL); |
|
|
|
const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.magic_bias); |
|
const float32x4_t vminus_log2e = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.minus_log2e); |
|
const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); |
|
const float32x4_t vln2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.ln2); |
|
const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.c2); |
|
const float32x4_t vone = vmovq_n_f32(1.0f); |
|
const float32x4_t vdenorm_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.denorm_cutoff); |
|
|
|
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { |
|
const float32x4_t vx0123 = vld1q_f32(input); input += 4; |
|
const float32x4_t vx4567 = vld1q_f32(input); input += 4; |
|
const float32x4_t vx89AB = vld1q_f32(input); input += 4; |
|
const float32x4_t vxCDEF = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz0123 = vabsq_f32(vx0123); |
|
const float32x4_t vz4567 = vabsq_f32(vx4567); |
|
const float32x4_t vz89AB = vabsq_f32(vx89AB); |
|
const float32x4_t vzCDEF = vabsq_f32(vxCDEF); |
|
|
|
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e); |
|
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e); |
|
float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e); |
|
float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e); |
|
|
|
const int32x4_t ve0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 17); |
|
const int32x4_t ve4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 17); |
|
const int32x4_t ve89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 17); |
|
const int32x4_t veCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 17); |
|
|
|
|
|
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); |
|
const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); |
|
const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); |
|
const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); |
|
|
|
const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); |
|
const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); |
|
float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx01]); |
|
float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx23]); |
|
const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); |
|
const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); |
|
float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx45]); |
|
float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx67]); |
|
const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); |
|
const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); |
|
float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx89]); |
|
float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxAB]); |
|
const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); |
|
const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); |
|
float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxCD]); |
|
float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxEF]); |
|
|
|
vl01 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); |
|
vl23 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); |
|
const float32x4_t vl0123 = vcombine_f32(vl01, vl23); |
|
vl45 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); |
|
vl67 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); |
|
const float32x4_t vl4567 = vcombine_f32(vl45, vl67); |
|
vl89 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); |
|
vlAB = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); |
|
const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); |
|
vlCD = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); |
|
vlEF = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); |
|
const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); |
|
|
|
const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); |
|
const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); |
|
const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); |
|
const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); |
|
|
|
vn0123 = vsubq_f32(vn0123, vmagic_bias); |
|
vn4567 = vsubq_f32(vn4567, vmagic_bias); |
|
vn89AB = vsubq_f32(vn89AB, vmagic_bias); |
|
vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); |
|
|
|
float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2); |
|
float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2); |
|
float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2); |
|
float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2); |
|
|
|
float32x4_t vp0123 = vmulq_f32(vt0123, vc2); |
|
float32x4_t vp4567 = vmulq_f32(vt4567, vc2); |
|
float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); |
|
float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); |
|
|
|
vp0123 = vfmsq_f32(vt0123, vp0123, vt0123); |
|
vp4567 = vfmsq_f32(vt4567, vp4567, vt4567); |
|
vp89AB = vfmsq_f32(vt89AB, vp89AB, vt89AB); |
|
vpCDEF = vfmsq_f32(vtCDEF, vpCDEF, vtCDEF); |
|
|
|
const float32x4_t vy0123 = vfmsq_f32(vs0123, vs0123, vp0123); |
|
const float32x4_t vy4567 = vfmsq_f32(vs4567, vs4567, vp4567); |
|
const float32x4_t vy89AB = vfmsq_f32(vs89AB, vs89AB, vp89AB); |
|
const float32x4_t vyCDEF = vfmsq_f32(vsCDEF, vsCDEF, vpCDEF); |
|
|
|
const float32x4_t vd0123 = vaddq_f32(vy0123, vone); |
|
const float32x4_t vd4567 = vaddq_f32(vy4567, vone); |
|
const float32x4_t vd89AB = vaddq_f32(vy89AB, vone); |
|
const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone); |
|
|
|
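// Reciprocal 1/d: vrecpe gives an estimate of roughly 8 bits, and each
// vrecps step computes (2 - r*d), so r *= (2 - r*d) is one Newton-Raphson
// refinement; two steps ("nr2recps") reach approximately full single
// precision.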
float32x4_t vr0123 = vrecpeq_f32(vd0123); |
|
float32x4_t vr4567 = vrecpeq_f32(vd4567); |
|
float32x4_t vr89AB = vrecpeq_f32(vd89AB); |
|
float32x4_t vrCDEF = vrecpeq_f32(vdCDEF); |
|
|
|
vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); |
|
vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); |
|
vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB)); |
|
vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF)); |
|
|
|
vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); |
|
vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); |
|
vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB)); |
|
vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF)); |
|
|
|
float32x4_t vf0123 = vmulq_f32(vy0123, vr0123); |
|
float32x4_t vf4567 = vmulq_f32(vy4567, vr4567); |
|
float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB); |
|
float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF); |
|
|
|
vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff))); |
|
vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff))); |
|
vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff))); |
|
vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff))); |
|
|
|
const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); |
|
const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); |
|
const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f)); |
|
const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f)); |
|
|
|
vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123)); |
|
vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567)); |
|
vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB)); |
|
vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF)); |
|
|
|
vst1q_f32(output, vf0123); output += 4; |
|
vst1q_f32(output, vf4567); output += 4; |
|
vst1q_f32(output, vf89AB); output += 4; |
|
vst1q_f32(output, vfCDEF); output += 4; |
|
} |
|
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { |
|
const float32x4_t vx = vld1q_f32(input); input += 4; |
|
|
|
const float32x4_t vz = vabsq_f32(vx); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); |
|
const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); |
|
|
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); |
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); |
|
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); |
|
vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); |
|
vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); |
|
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); |
|
|
|
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); |
|
vn = vsubq_f32(vn, vmagic_bias); |
|
float32x4_t vt = vfmaq_f32(vz, vn, vln2); |
|
|
|
float32x4_t vp = vmulq_f32(vt, vc2); |
|
vp = vfmsq_f32(vt, vp, vt); |
|
|
|
const float32x4_t vy = vfmsq_f32(vs, vs, vp); |
|
const float32x4_t vd = vaddq_f32(vy, vone); |
|
|
|
float32x4_t vr = vrecpeq_f32(vd); |
|
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); |
|
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); |
|
|
|
float32x4_t vf = vmulq_f32(vy, vr); |
|
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); |
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); |
|
|
|
vst1q_f32(output, vf); output += 4; |
|
} |
|
if XNN_UNLIKELY(batch != 0) { |
|
const float32x4_t vx = vld1q_f32(input); |
|
|
|
const float32x4_t vz = vabsq_f32(vx); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); |
|
const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); |
|
|
|
const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); |
|
const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); |
|
const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); |
|
float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); |
|
float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); |
|
vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); |
|
vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); |
|
const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); |
|
|
|
const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); |
|
vn = vsubq_f32(vn, vmagic_bias); |
|
float32x4_t vt = vfmaq_f32(vz, vn, vln2); |
|
|
|
float32x4_t vp = vmulq_f32(vt, vc2); |
|
vp = vfmsq_f32(vt, vp, vt); |
|
|
|
const float32x4_t vy = vfmsq_f32(vs, vs, vp); |
|
const float32x4_t vd = vaddq_f32(vy, vone); |
|
|
|
float32x4_t vr = vrecpeq_f32(vd); |
|
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); |
|
vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); |
|
|
|
float32x4_t vf = vmulq_f32(vy, vr); |
|
vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); |
|
const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); |
|
vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); |
|
|
|
float32x2_t vf_lo = vget_low_f32(vf); |
|
if (batch & (2 * sizeof(float))) { |
|
vst1_f32(output, vf_lo); output += 2; |
|
vf_lo = vget_high_f32(vf); |
|
} |
|
if (batch & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vf_lo, 0); |
|
} |
|
} |
|
} |
|
|
|
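// tanh kernel via expm1 ("expm1minus"): for z = |x| clamped at sat_cutoff,
// t = expm1(-2z) is evaluated with a degree-6 polynomial, then
// tanh(z) = -t / (t + 2), and the sign of x is merged back through the
// sign-bit mask. The reciprocal of (t + 2) uses two FMA-based
// Newton-Raphson steps ("nr2fma").
//
// Scalar reference of that identity (illustrative only; the helper name is
// invented, and expm1f from <math.h> replaces the polynomial):
static inline float xnn_tanh_scalar_reference(float x) {
  const float t = expm1f(-2.0f * fabsf(x));  // t in (-1, 0]
  return copysignf(-t / (t + 2.0f), x);
}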
void xnn_f32_vtanh_ukernel__neonfma_expm1minus_rr1_p6h5ts_nr2fma_x8( |
|
size_t batch, |
|
const float* input, |
|
float* output, |
|
const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS |
|
{ |
|
const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.sat_cutoff); |
|
const float32x4_t vminus_log2e = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.minus_log2e); |
|
|
|
const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.magic_bias); |
|
|
|
const float32x4_t vln2 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.ln2); |
|
|
|
const float32x4_t vc6 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c6); |
|
const float32x4_t vc5 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c5); |
|
const float32x4_t vc4 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c4); |
|
const float32x4_t vc3 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c3); |
|
const float32x4_t vc2 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c2); |
|
|
|
const float32x4_t vone = vmovq_n_f32(1.0f); |
|
const float32x4_t vtwo = vmovq_n_f32(2.0f); |
|
|
|
const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); |
|
|
|
for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { |
|
const float32x4_t vx0123 = vld1q_f32(input); input += 4; |
|
const float32x4_t vx4567 = vld1q_f32(input); input += 4; |
|
|
|
float32x4_t vz0123 = vabsq_f32(vx0123); |
|
float32x4_t vz4567 = vabsq_f32(vx4567); |
|
vz0123 = vminq_f32(vz0123, vsat_cutoff); |
|
vz4567 = vminq_f32(vz4567, vsat_cutoff); |
|
|
|
float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e); |
|
float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e); |
|
|
|
const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); |
|
const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); |
|
|
|
vn0123 = vsubq_f32(vn0123, vmagic_bias); |
|
vn4567 = vsubq_f32(vn4567, vmagic_bias); |
|
|
|
const float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2); |
|
const float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2); |
|
|
|
float32x4_t vp0123 = vfmaq_f32(vc5, vc6, vt0123); |
|
float32x4_t vp4567 = vfmaq_f32(vc5, vc6, vt4567); |
|
vp0123 = vfmaq_f32(vc4, vp0123, vt0123); |
|
vp0123 = vfmaq_f32(vc3, vp0123, vt0123); |
|
vp0123 = vfmaq_f32(vc2, vp0123, vt0123); |
|
vp4567 = vfmaq_f32(vc4, vp4567, vt4567); |
|
vp4567 = vfmaq_f32(vc3, vp4567, vt4567); |
|
vp4567 = vfmaq_f32(vc2, vp4567, vt4567); |
|
vp0123 = vfmsq_f32(vtwo, vp0123, vt0123); |
|
vp4567 = vfmsq_f32(vtwo, vp4567, vt4567); |
|
|
|
const float32x4_t vts0123 = vmulq_f32(vt0123, vs0123); |
|
const float32x4_t vsmo0123 = vsubq_f32(vs0123, vone); |
|
const float32x4_t vts4567 = vmulq_f32(vt4567, vs4567); |
|
const float32x4_t vsmo4567 = vsubq_f32(vs4567, vone); |
|
const float32x4_t vemo0123 = vfmsq_f32(vsmo0123, vp0123, vts0123); |
|
const float32x4_t vemo4567 = vfmsq_f32(vsmo4567, vp4567, vts4567); |
|
|
|
const float32x4_t vepo0123 = vaddq_f32(vemo0123, vtwo); |
|
const float32x4_t vepo4567 = vaddq_f32(vemo4567, vtwo); |
|
|
|
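// FMA-based Newton-Raphson for 1/(t + 2): err = 1 - r*d via vfms, then
// r += r*err; repeated twice ("nr2fma").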
float32x4_t vrepo0123 = vrecpeq_f32(vepo0123); |
|
float32x4_t vrepo4567 = vrecpeq_f32(vepo4567); |
|
float32x4_t verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123); |
|
float32x4_t verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567); |
|
vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123); |
|
vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567); |
|
verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123); |
|
verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567); |
|
vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123); |
|
vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567); |
|
|
|
float32x4_t vy0123 = vmulq_f32(vemo0123, vrepo0123); |
|
float32x4_t vy4567 = vmulq_f32(vemo4567, vrepo4567); |
|
|
|
vy0123 = vbslq_f32(vsign_mask, vx0123, vy0123); |
|
vy4567 = vbslq_f32(vsign_mask, vx4567, vy4567); |
|
|
|
vst1q_f32(output, vy0123); output += 4; |
|
vst1q_f32(output, vy4567); output += 4; |
|
} |
|
|
|
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { |
|
const float32x4_t vx = vld1q_f32(input); input += 4; |
|
|
|
float32x4_t vz = vabsq_f32(vx); |
|
vz = vminq_f32(vz, vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); |
|
|
|
const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
const float32x4_t vt = vfmaq_f32(vz, vn, vln2); |
|
|
|
float32x4_t vp = vfmaq_f32(vc5, vc6, vt); |
|
vp = vfmaq_f32(vc4, vp, vt); |
|
vp = vfmaq_f32(vc3, vp, vt); |
|
vp = vfmaq_f32(vc2, vp, vt); |
|
vp = vfmsq_f32(vtwo, vp, vt); |
|
|
|
const float32x4_t vts = vmulq_f32(vt, vs); |
|
const float32x4_t vsmo = vsubq_f32(vs, vone); |
|
const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts); |
|
|
|
const float32x4_t vepo = vaddq_f32(vemo, vtwo); |
|
|
|
float32x4_t vrepo = vrecpeq_f32(vepo); |
|
float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo); |
|
vrepo = vfmaq_f32(vrepo, vrepo, verepo); |
|
verepo = vfmsq_f32(vone, vrepo, vepo); |
|
vrepo = vfmaq_f32(vrepo, vrepo, verepo); |
|
|
|
float32x4_t vy = vmulq_f32(vemo, vrepo); |
|
|
|
vy = vbslq_f32(vsign_mask, vx, vy); |
|
vst1q_f32(output, vy); output += 4; |
|
} |
|
if XNN_UNLIKELY(batch != 0) { |
|
const float32x4_t vx = vld1q_f32(input); |
|
|
|
float32x4_t vz = vabsq_f32(vx); |
|
vz = vminq_f32(vz, vsat_cutoff); |
|
|
|
float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); |
|
|
|
const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); |
|
|
|
vn = vsubq_f32(vn, vmagic_bias); |
|
|
|
const float32x4_t vt = vfmaq_f32(vz, vn, vln2); |
|
|
|
float32x4_t vp = vfmaq_f32(vc5, vc6, vt); |
|
vp = vfmaq_f32(vc4, vp, vt); |
|
vp = vfmaq_f32(vc3, vp, vt); |
|
vp = vfmaq_f32(vc2, vp, vt); |
|
vp = vfmsq_f32(vtwo, vp, vt); |
|
|
|
const float32x4_t vts = vmulq_f32(vt, vs); |
|
const float32x4_t vsmo = vsubq_f32(vs, vone); |
|
const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts); |
|
|
|
const float32x4_t vepo = vaddq_f32(vemo, vtwo); |
|
|
|
float32x4_t vrepo = vrecpeq_f32(vepo); |
|
float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo); |
|
vrepo = vfmaq_f32(vrepo, vrepo, verepo); |
|
verepo = vfmsq_f32(vone, vrepo, vepo); |
|
vrepo = vfmaq_f32(vrepo, vrepo, verepo); |
|
|
|
float32x4_t vy = vmulq_f32(vemo, vrepo); |
|
|
|
vy = vbslq_f32(vsign_mask, vx, vy); |
|
|
|
float32x2_t vy_low = vget_low_f32(vy); |
|
|
|
if (batch & (2 * sizeof(float))) { |
|
vst1_f32(output, vy_low); output += 2; |
|
vy_low = vget_high_f32(vy); |
|
} |
|
if (batch & (1 * sizeof(float))) { |
|
vst1_lane_f32(output, vy_low, 0); |
|
} |
|
} |
|
} |
|
|