// Copyright 2021 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void xnn_f32_dwconv_minmax_ukernel_25p8c__neonfma_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const float*) ((uintptr_t) i9 + input_offset); } const float* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const float*) ((uintptr_t) i10 + input_offset); } const float* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const float*) ((uintptr_t) i11 + input_offset); } const float* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const float*) ((uintptr_t) i12 + input_offset); } const float* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const float*) ((uintptr_t) i13 + input_offset); } const float* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const float*) ((uintptr_t) i14 + input_offset); } const float* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const float*) ((uintptr_t) i15 + input_offset); } const float* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const float*) ((uintptr_t) i16 + input_offset); } const float* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const float*) ((uintptr_t) i17 + input_offset); } const float* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const float*) ((uintptr_t) i18 + input_offset); } const float* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const float*) ((uintptr_t) i19 + input_offset); } const float* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const float*) ((uintptr_t) i20 + input_offset); } const float* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const float*) ((uintptr_t) i21 + input_offset); } const float* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const float*) ((uintptr_t) i22 + input_offset); } const float* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const float*) ((uintptr_t) i23 + input_offset); } const float* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const float*) ((uintptr_t) i24 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vi9x4567 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w); w += 4; const float32x4_t vk9x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi9x4567, vk9x4567); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vi10x4567 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w); w += 4; const float32x4_t vk10x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi10x4567, vk10x4567); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vi11x4567 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w); w += 4; const float32x4_t vk11x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi11x4567, vk11x4567); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vi12x4567 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w); w += 4; const float32x4_t vk12x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi12x4567, vk12x4567); const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vi13x4567 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w); w += 4; const float32x4_t vk13x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi13x4567, vk13x4567); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vi14x4567 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w); w += 4; const float32x4_t vk14x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi14x4567, vk14x4567); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vi15x4567 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w); w += 4; const float32x4_t vk15x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi15x4567, vk15x4567); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vi16x4567 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w); w += 4; const float32x4_t vk16x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi16x4567, vk16x4567); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vi17x4567 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w); w += 4; const float32x4_t vk17x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi17x4567, vk17x4567); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vi18x4567 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w); w += 4; const float32x4_t vk18x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi18x4567, vk18x4567); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vi19x4567 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w); w += 4; const float32x4_t vk19x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi19x4567, vk19x4567); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vi20x4567 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w); w += 4; const float32x4_t vk20x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi20x4567, vk20x4567); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vi21x4567 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w); w += 4; const float32x4_t vk21x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi21x4567, vk21x4567); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vi22x4567 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w); w += 4; const float32x4_t vk22x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi22x4567, vk22x4567); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vi23x4567 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w); w += 4; const float32x4_t vk23x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi23x4567, vk23x4567); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vi24x4567 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w); w += 4; const float32x4_t vk24x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi24x4567, vk24x4567); // Add up all accumulators to vacc01234567p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w + 76); vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w + 84); vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w + 92); vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w + 100); vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w + 108); vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w + 116); vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w + 124); vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w + 132); vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w + 140); vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w + 148); vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w + 156); vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w + 164); vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w + 172); vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w + 180); vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w + 188); vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w + 196); vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); const float32x4_t vk9x0123 = vld1q_f32(w + 80); vacc0123p1 = vfmaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); const float32x4_t vk10x0123 = vld1q_f32(w + 88); vacc0123p0 = vfmaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); const float32x4_t vk11x0123 = vld1q_f32(w + 96); vacc0123p1 = vfmaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); const float32x4_t vk12x0123 = vld1q_f32(w + 104); vacc0123p0 = vfmaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); const float32x4_t vk13x0123 = vld1q_f32(w + 112); vacc0123p1 = vfmaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); const float32x4_t vk14x0123 = vld1q_f32(w + 120); vacc0123p0 = vfmaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); const float32x4_t vk15x0123 = vld1q_f32(w + 128); vacc0123p1 = vfmaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); const float32x4_t vk16x0123 = vld1q_f32(w + 136); vacc0123p0 = vfmaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); const float32x4_t vk17x0123 = vld1q_f32(w + 144); vacc0123p1 = vfmaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); const float32x4_t vk18x0123 = vld1q_f32(w + 152); vacc0123p0 = vfmaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); const float32x4_t vk19x0123 = vld1q_f32(w + 160); vacc0123p1 = vfmaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); const float32x4_t vk20x0123 = vld1q_f32(w + 168); vacc0123p0 = vfmaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); const float32x4_t vk21x0123 = vld1q_f32(w + 176); vacc0123p1 = vfmaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); const float32x4_t vk22x0123 = vld1q_f32(w + 184); vacc0123p0 = vfmaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); const float32x4_t vk23x0123 = vld1q_f32(w + 192); vacc0123p1 = vfmaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); const float32x4_t vk24x0123 = vld1q_f32(w + 200); vacc0123p0 = vfmaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3p8c__neonfma( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_4p8c__neonfma( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi3x4567, vk3x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_5f5m5l8c4s4r__neonfma_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 5); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); do { const float* w = weights; // First pass to process 5 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; // Process c channels and write to buffer. size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); vst1q_f32(b, vacc0123p0); b += 4; vst1q_f32(b, vacc4567p0); b += 4; } if (c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Middle pass to process 5 inputs in each iteration. for (size_t ks = kernel_size - 5; ks > 5; ks -= 5) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(b); float32x4_t vacc4567p0 = vld1q_f32(b + 4); const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); vst1q_f32(b, vacc0123p0); b += 4; vst1q_f32(b, vacc4567p0); b += 4; } if (c != 0) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Last pass to process up to 5 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } size_t c = channels; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; float32x4_t vacc4567p0 = vld1q_f32(b); b += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; float32x4_t vk0x0123 = vld1q_f32(w); w += 4; float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; float32x4_t vk2x0123 = vld1q_f32(w); w += 4; float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; float32x4_t vk3x0123 = vld1q_f32(w); w += 4; float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; float32x4_t vk4x0123 = vld1q_f32(w); w += 4; float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_9p8c__neonfma( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); } const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; float32x4_t va1 = vld1q_f32(a1); a1 += 4; float32x4_t va2 = vld1q_f32(a2); a2 += 4; float32x4_t va3 = vld1q_f32(a3); a3 += 4; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); } const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 6); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } const float* a5 = (const float*) ((uintptr_t) a4 + a_stride); float* c5 = (float*) ((uintptr_t) c4 + cm_stride); if XNN_UNPREDICTABLE(mr != 6) { a5 = a4; c5 = c4; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; float32x4_t vacc4x0123 = vacc0x0123; float32x4_t vacc4x4567 = vacc0x4567; float32x4_t vacc5x0123 = vacc0x0123; float32x4_t vacc5x4567 = vacc0x4567; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; float32x4_t va1 = vld1q_f32(a1); a1 += 4; float32x4_t va2 = vld1q_f32(a2); a2 += 4; float32x4_t va3 = vld1q_f32(a3); a3 += 4; float32x4_t va4 = vld1q_f32(a4); a4 += 4; float32x4_t va5 = vld1q_f32(a5); a5 += 4; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3); k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k); float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0); const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0); const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1); const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1); const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2); const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2); const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3); const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3); const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3); } const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc4x0123 = vminq_f32(vacc4x0123, vmax); vacc5x0123 = vminq_f32(vacc5x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); vacc4x4567 = vminq_f32(vacc4x4567, vmax); vacc5x4567 = vminq_f32(vacc5x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc4x0123 = vmaxq_f32(vacc4x0123, vmin); vacc5x0123 = vmaxq_f32(vacc5x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); vacc5x4567 = vmaxq_f32(vacc5x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c5, vacc5x0123); vst1q_f32(c5 + 4, vacc5x4567); c5 = (float*) ((uintptr_t) c5 + cn_stride); vst1q_f32(c4, vacc4x0123); vst1q_f32(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a5 = (const float*) ((uintptr_t) a5 - kc); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c5, vacc5x0123); c5 += 4; vst1q_f32(c4, vacc4x0123); c4 += 4; vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc5x0123 = vacc5x4567; vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc5x01 = vget_low_f32(vacc5x0123); float32x2_t vacc4x01 = vget_low_f32(vacc4x0123); float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c5, vacc5x01); c5 += 2; vst1_f32(c4, vacc4x01); c4 += 2; vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc5x01 = vget_high_f32(vacc5x0123); vacc4x01 = vget_high_f32(vacc4x0123); vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c5, vacc5x01, 0); vst1_lane_f32(c4, vacc4x01, 0); vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_ibilinear_chw_ukernel__neonfma_p8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t input_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(input_increment % sizeof(float) == 0); do { const float** i = input; const float* w = weights; size_t p = output_pixels; for (; p >= 8; p -= 8) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset); const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset); const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset); const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset); const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset); const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset); const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset); const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset); i += 2 * 8; const float32x4x2_t vw0123 = vld2q_f32(w + 0); const float32x4x2_t vw4567 = vld2q_f32(w + 8); w += 2 * 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x2_t vtltr4 = vld1_f32(itl4); const float32x2_t vblbr4 = vld1_f32(ibl4); const float32x2_t vtltr5 = vld1_f32(itl5); const float32x2_t vblbr5 = vld1_f32(ibl5); const float32x2_t vtltr6 = vld1_f32(itl6); const float32x2_t vblbr6 = vld1_f32(ibl6); const float32x2_t vtltr7 = vld1_f32(itl7); const float32x2_t vblbr7 = vld1_f32(ibl7); const float32x4_t valphah0123 = vw0123.val[0]; const float32x4_t valphav0123 = vw0123.val[1]; const float32x4_t valphah4567 = vw4567.val[0]; const float32x4_t valphav4567 = vw4567.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5); const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5); const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7); const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45); const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67); const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld0123 = vld_t0123.val[0]; const float32x4_t vrd0123 = vld_t0123.val[1]; const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67); const float32x4_t vld4567 = vld_t4567.val[0]; const float32x4_t vrd4567 = vld_t4567.val[1]; const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl0123 = vtl_t0123.val[0]; const float32x4_t vtr0123 = vtl_t0123.val[1]; const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67); const float32x4_t vtl4567 = vtl_t4567.val[0]; const float32x4_t vtr4567 = vtl_t4567.val[1]; const float32x4_t vl0123 = vfmaq_f32(vtl0123, vld0123, valphav0123); const float32x4_t vr0123 = vfmaq_f32(vtr0123, vrd0123, valphav0123); const float32x4_t vl4567 = vfmaq_f32(vtl4567, vld4567, valphav4567); const float32x4_t vr4567 = vfmaq_f32(vtr4567, vrd4567, valphav4567); const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123); const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567); const float32x4_t vo0123 = vfmaq_f32(vl0123, vd0123, valphah0123); const float32x4_t vo4567 = vfmaq_f32(vl4567, vd4567, valphah4567); vst1q_f32(output + 0, vo0123); vst1q_f32(output + 4, vo4567); output += 8; } for (; p >= 4; p -= 4) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); i += 8; const float32x4x2_t vw = vld2q_f32(w); w += 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x4_t valphah = vw.val[0]; const float32x4_t valphav = vw.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld = vld_t.val[0]; const float32x4_t vrd = vld_t.val[1]; const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl = vtl_t.val[0]; const float32x4_t vtr = vtl_t.val[1]; const float32x4_t vl = vfmaq_f32(vtl, vld, valphav); const float32x4_t vr = vfmaq_f32(vtr, vrd, valphav); const float32x4_t vd = vsubq_f32(vr, vl); const float32x4_t vo = vfmaq_f32(vl, vd, valphah); vst1q_f32(output, vo); output += 4; } if XNN_UNLIKELY(p != 0) { if (p & 2) { const float32x2x2_t vw = vld2_f32(w); w += 4; const float32x2_t valphah = vw.val[0]; const float32x2_t valphav = vw.val[1]; const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); i += 4; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0); const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1); const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1); const float32x2_t vld = vld_t.val[0]; const float32x2_t vrd = vld_t.val[1]; const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1); const float32x2_t vtl = vtl_t.val[0]; const float32x2_t vtr = vtl_t.val[1]; const float32x2_t vl = vfma_f32(vtl, vld, valphav); const float32x2_t vr = vfma_f32(vtr, vrd, valphav); const float32x2_t vd = vsub_f32(vr, vl); const float32x2_t vo = vfma_f32(vl, vd, valphah); vst1_f32(output, vo); output += 2; } if (p & 1) { // We are computing the following formula: // result = (1 - alpha_h) * (1 - alpha_v) * top_left + // alpha_h * (1 - alpha_v) * top_right + // (1 - alpha_h) * alpha_v * bottom_left + // alpha_h * alpha_v * bottom_right. // // Rearranging gives // result = left + alpha_h * (right - left), // where // left = top_left + alpha_v * (bottom_left - top_left), // right = top_right + alpha_v * (bottom_right - top_right). const float alphah = *w; const float32x2_t valphav = vld1_dup_f32(w + 1); w += 2; const float* itl = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); i += 2; const float32x2_t vtltr = vld1_f32(itl); const float32x2_t vblbr = vld1_f32(ibl); // Compute at once // left_diff = bottom_left - top_left // right_diff = bottom_right - top_right const float32x2_t vldrd = vsub_f32(vblbr, vtltr); const float32x2_t vlr = vfma_f32(vtltr, vldrd, valphav); // Extract them and compute the result. const float l = vget_lane_f32(vlr, 0); const float r = vget_lane_f32(vlr, 1); *output++ = l + alphah * (r - l); } } input_offset += input_increment; } while (--channels != 0); } void xnn_f32_ibilinear_ukernel__neonfma_c8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); do { const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset); const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset); const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset); const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset); input += 4; const float32x2_t valphahv = vld1_f32(weights); weights += 2; #if XNN_ARCH_ARM const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); const float32x4_t valphav = vdupq_lane_f32(valphahv, 1); #endif size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtl4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vtd4567 = vsubq_f32(vtr4567, vtl4567); const float32x4_t vbd4567 = vsubq_f32(vbr4567, vbl4567); #if XNN_ARCH_ARM const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); const float32x4_t vt4567 = vfmaq_f32(vtl4567, vtd4567, valphah); const float32x4_t vb4567 = vfmaq_f32(vbl4567, vbd4567, valphah); #else const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vt4567 = vfmaq_lane_f32(vtl4567, vtd4567, valphahv, 0); const float32x4_t vb4567 = vfmaq_lane_f32(vbl4567, vbd4567, valphahv, 0); #endif const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vd4567 = vsubq_f32(vb4567, vt4567); #if XNN_ARCH_ARM const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); const float32x4_t vo4567 = vfmaq_f32(vt4567, vd4567, valphav); #else const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); const float32x4_t vo4567 = vfmaq_lane_f32(vt4567, vd4567, valphahv, 1); #endif vst1q_f32(output, vo0123); output += 4; vst1q_f32(output, vo4567); output += 4; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); #if XNN_ARCH_ARM const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); #else const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); #endif const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); #if XNN_ARCH_ARM const float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); #else const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); #endif vst1q_f32(output, vo0123); output += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vtl0123 = vld1q_f32(i0); const float32x4_t vtr0123 = vld1q_f32(i1); const float32x4_t vbl0123 = vld1q_f32(i2); const float32x4_t vbr0123 = vld1q_f32(i3); const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); #if XNN_ARCH_ARM const float32x4_t vt0123 = vfmaq_f32(vtl0123, vtd0123, valphah); const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); #else const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); #endif const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); #if XNN_ARCH_ARM float32x4_t vo0123 = vfmaq_f32(vt0123, vd0123, valphav); #else float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); #endif float32x2_t vo01 = vget_low_f32(vo0123); if (c & (2 * sizeof(float))) { vst1_f32(output, vo01); output += 2; vo01 = vget_high_f32(vo0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(output, vo01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; const float32x4_t vb0123c0 = vld1q_f32(w + 0); const float32x4_t vb4567c0 = vld1q_f32(w + 4); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c1 = vld1q_f32(w + 8); const float32x4_t vb4567c1 = vld1q_f32(w + 12); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c2 = vld1q_f32(w + 16); const float32x4_t vb4567c2 = vld1q_f32(w + 20); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c3 = vld1q_f32(w + 24); const float32x4_t vb4567c3 = vld1q_f32(w + 28); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); } p -= 1 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; float32x4_t va1 = vld1q_f32(a1); a1 += 4; float32x4_t va2 = vld1q_f32(a2); a2 += 4; float32x4_t va3 = vld1q_f32(a3); a3 += 4; const float32x4_t vb0123c0 = vld1q_f32(w + 0); const float32x4_t vb4567c0 = vld1q_f32(w + 4); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c1 = vld1q_f32(w + 8); const float32x4_t vb4567c1 = vld1q_f32(w + 12); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c2 = vld1q_f32(w + 16); const float32x4_t vb4567c2 = vld1q_f32(w + 20); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c3 = vld1q_f32(w + 24); const float32x4_t vb4567c3 = vld1q_f32(w + 28); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); } p -= 4 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 6); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (6 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { c3 = c2; } float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { c4 = c3; } float* c5 = (float*) ((uintptr_t) c4 + cm_stride); if XNN_UNPREDICTABLE(mr != 6) { c5 = c4; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; float32x4_t vacc4x0123 = vacc0x0123; float32x4_t vacc4x4567 = vacc0x4567; float32x4_t vacc5x0123 = vacc0x0123; float32x4_t vacc5x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } const float* restrict a4 = a[4]; assert(a4 != NULL); if XNN_UNPREDICTABLE(a4 != zero) { a4 = (const float*) ((uintptr_t) a4 + a_offset); } const float* restrict a5 = a[5]; assert(a5 != NULL); if XNN_UNPREDICTABLE(a5 != zero) { a5 = (const float*) ((uintptr_t) a5 + a_offset); } a += 6; size_t k = kc; while (k >= 4 * sizeof(float)) { float32x4_t va0 = vld1q_f32(a0); a0 += 4; float32x4_t va1 = vld1q_f32(a1); a1 += 4; float32x4_t va2 = vld1q_f32(a2); a2 += 4; float32x4_t va3 = vld1q_f32(a3); a3 += 4; float32x4_t va4 = vld1q_f32(a4); a4 += 4; float32x4_t va5 = vld1q_f32(a5); a5 += 4; const float32x4_t vb0123c0 = vld1q_f32(w + 0); const float32x4_t vb4567c0 = vld1q_f32(w + 4); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c0); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c0); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c0); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c0); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c0); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c0); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c0); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c0); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c1 = vld1q_f32(w + 8); const float32x4_t vb4567c1 = vld1q_f32(w + 12); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c1); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c1); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c1); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c1); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c1); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c1); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c1); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c1); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c2 = vld1q_f32(w + 16); const float32x4_t vb4567c2 = vld1q_f32(w + 20); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c2); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c2); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c2); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c2); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c2); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c2); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c2); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c2); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c3 = vld1q_f32(w + 24); const float32x4_t vb4567c3 = vld1q_f32(w + 28); vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c3); vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123c3); vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123c3); vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123c3); vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123c3); vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c3); vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c3); vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567c3); vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567c3); vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567c3); vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567c3); vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c3); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k); float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k); float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k); float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k); float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k); float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k); const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0); const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0); const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0); const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0); const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0); const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0); const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0); const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0); const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0); const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0); const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0); const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1); const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1); const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1); const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1); const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1); const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1); const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1); const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1); const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1); const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1); const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1); const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2); const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2); const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2); const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2); const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2); const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2); const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2); const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2); const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2); const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2); const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2); const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2); va0 = vextq_f32(va0, va0, 1); va1 = vextq_f32(va1, va1, 1); va2 = vextq_f32(va2, va2, 1); va3 = vextq_f32(va3, va3, 1); va4 = vextq_f32(va4, va4, 1); va5 = vextq_f32(va5, va5, 1); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc0x0123 = vfmaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3); const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc1x0123 = vfmaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3); const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc2x0123 = vfmaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3); const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc3x0123 = vfmaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3); const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc4x0123 = vfmaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3); const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f)))); vacc5x0123 = vfmaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3); const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc0x4567 = vfmaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3); const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc1x4567 = vfmaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3); const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc2x4567 = vfmaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3); const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc3x4567 = vfmaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3); const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc4x4567 = vfmaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3); const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f)))); vacc5x4567 = vfmaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3); } p -= 6 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc4x0123 = vminq_f32(vacc4x0123, vmax); vacc5x0123 = vminq_f32(vacc5x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); vacc4x4567 = vminq_f32(vacc4x4567, vmax); vacc5x4567 = vminq_f32(vacc5x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc4x0123 = vmaxq_f32(vacc4x0123, vmin); vacc5x0123 = vmaxq_f32(vacc5x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); vacc4x4567 = vmaxq_f32(vacc4x4567, vmin); vacc5x4567 = vmaxq_f32(vacc5x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c5, vacc5x0123); vst1q_f32(c5 + 4, vacc5x4567); c5 = (float*) ((uintptr_t) c5 + cn_stride); vst1q_f32(c4, vacc4x0123); vst1q_f32(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c5, vacc5x0123); c5 += 4; vst1q_f32(c4, vacc4x0123); c4 += 4; vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc5x0123 = vacc5x4567; vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc5x01 = vget_low_f32(vacc5x0123); float32x2_t vacc4x01 = vget_low_f32(vacc4x0123); float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c5, vacc5x01); c5 += 2; vst1_f32(c4, vacc4x01); c4 += 2; vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc5x01 = vget_high_f32(vacc5x0123); vacc4x01 = vget_high_f32(vacc4x0123); vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c5, vacc5x01, 0); vst1_lane_f32(c4, vacc4x01, 0); vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; void xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16( size_t batch, const float* input, const float* max, float* output, float* sum, const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(max != NULL); assert(output != NULL); assert(sum != NULL); const float32x4_t vi_max = vld1q_dup_f32(max); const float32x4_t vlog2e = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.magic_bias); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vminus_ln2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.minus_ln2); const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.c2); const float32x4_t vdenorm_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.denorm_cutoff); float32x4_t vacc0 = vmovq_n_f32(0.0f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vi0123 = vld1q_f32(input); input += 4; const float32x4_t vi4567 = vld1q_f32(input); input += 4; const float32x4_t vi89AB = vld1q_f32(input); input += 4; const float32x4_t viCDEF = vld1q_f32(input); input += 4; const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); const float32x4_t vx89AB = vsubq_f32(vi89AB, vi_max); const float32x4_t vxCDEF = vsubq_f32(viCDEF, vi_max); float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vx0123, vlog2e); float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vx4567, vlog2e); float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vx89AB, vlog2e); float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vxCDEF, vlog2e); const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x3F))), 17); const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx89]); float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxAB]); float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxCD]); float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidxEF]); vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); vn89AB = vsubq_f32(vn89AB, vmagic_bias); vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); float32x4_t vt0123 = vfmaq_f32(vx0123, vn0123, vminus_ln2); float32x4_t vt4567 = vfmaq_f32(vx4567, vn4567, vminus_ln2); float32x4_t vt89AB = vfmaq_f32(vx89AB, vn89AB, vminus_ln2); float32x4_t vtCDEF = vfmaq_f32(vxCDEF, vnCDEF, vminus_ln2); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); vp0123 = vfmaq_f32(vt0123, vt0123, vp0123); vp4567 = vfmaq_f32(vt4567, vt4567, vp4567); vp89AB = vfmaq_f32(vt89AB, vt89AB, vp89AB); vpCDEF = vfmaq_f32(vtCDEF, vtCDEF, vpCDEF); float32x4_t vf0123 = vfmaq_f32(vs0123, vs0123, vp0123); float32x4_t vf4567 = vfmaq_f32(vs4567, vs4567, vp4567); float32x4_t vf89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB); float32x4_t vfCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff))); vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff))); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; vst1q_f32(output, vf89AB); output += 4; vst1q_f32(output, vfCDEF); output += 4; vacc0 = vaddq_f32(vacc0, vf0123); vacc0 = vaddq_f32(vacc0, vf4567); vacc0 = vaddq_f32(vacc0, vf89AB); vacc0 = vaddq_f32(vacc0, vfCDEF); } float32x4_t vacc = vacc0; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); float32x4_t vp = vmulq_f32(vt, vc2); vp = vfmaq_f32(vt, vt, vp); float32x4_t vf = vfmaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); vst1q_f32(output, vf); output += 4; vacc = vaddq_f32(vacc, vf); } #if XNN_ARCH_ARM64 float vacc_lo = vaddvq_f32(vacc); #else float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); #endif if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 3 * sizeof(float)); const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vfmaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vx, vn, vminus_ln2); float32x4_t vp = vmulq_f32(vt, vc2); vp = vfmaq_f32(vt, vt, vp); float32x4_t vf = vfmaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; #if XNN_ARCH_ARM64 vacc_lo += vaddv_f32(vf_lo); #else vacc_lo = vadd_f32(vacc_lo, vf_lo); #endif vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); #if XNN_ARCH_ARM64 vacc_lo += vget_lane_f32(vf_lo, 0); #else vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); #endif } } #if XNN_ARCH_ARM64 *sum = vacc_lo; #else vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); #endif } void xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined( size_t mc, size_t nc, const float* input, const float* weights, const int32_t* widx_dmap, const uint32_t* nidx_nnzmap, float* output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mc != 0); assert(mc % sizeof(float) == 0); assert(nc != 0); #if XNN_ARCH_ARM64 const float32x4x2_t vminmax = vld2q_dup_f32(¶ms->scalar.min); const float32x4_t vmin = vminmax.val[0]; const float32x4_t vmax = vminmax.val[1]; #else const float32x2x2_t vminmax = vld2_dup_f32(¶ms->scalar.min); const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]); const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]); #endif size_t output_decrement = output_stride * nc - 32 * sizeof(float); while XNN_LIKELY(mc >= 32 * sizeof(float)) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; float32x4_t vw = vld1q_dup_f32(w); w += 1; intptr_t diff = *dmap++; float32x4_t vi0123 = vld1q_f32(input); float32x4_t vi4567 = vld1q_f32(input + 4); float32x4_t vi89AB = vld1q_f32(input + 8); float32x4_t viCDEF = vld1q_f32(input + 12); float32x4_t viGHIJ = vld1q_f32(input + 16); float32x4_t viKLMN = vld1q_f32(input + 20); float32x4_t viOPQR = vld1q_f32(input + 24); float32x4_t viSTUV = vld1q_f32(input + 28); size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vw; float32x4_t vacc4567 = vw; float32x4_t vacc89AB = vw; float32x4_t vaccCDEF = vw; float32x4_t vaccGHIJ = vw; float32x4_t vaccKLMN = vw; float32x4_t vaccOPQR = vw; float32x4_t vaccSTUV = vw; vw = vld1q_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { vacc0123 = vfmaq_f32(vacc0123, vi0123, vw); vacc4567 = vfmaq_f32(vacc4567, vi4567, vw); vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw); vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw); vaccGHIJ = vfmaq_f32(vaccGHIJ, viGHIJ, vw); vaccKLMN = vfmaq_f32(vaccKLMN, viKLMN, vw); vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw); vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); diff = *dmap++; vw = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vi0123 = vld1q_f32(input); vi4567 = vld1q_f32(input + 4); vi89AB = vld1q_f32(input + 8); viCDEF = vld1q_f32(input + 12); viGHIJ = vld1q_f32(input + 16); viKLMN = vld1q_f32(input + 20); viOPQR = vld1q_f32(input + 24); viSTUV = vld1q_f32(input + 28); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax); float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax); float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax); float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB = vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); voutGHIJ = vmaxq_f32(voutGHIJ, vmin); voutKLMN = vmaxq_f32(voutKLMN, vmin); voutOPQR = vmaxq_f32(voutOPQR, vmin); voutSTUV = vmaxq_f32(voutSTUV, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); vst1q_f32(output + 16, voutGHIJ); vst1q_f32(output + 20, voutKLMN); vst1q_f32(output + 24, voutOPQR); vst1q_f32(output + 28, voutSTUV); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 32; mc -= 32 * sizeof(float); } if XNN_UNLIKELY(mc != 0) { output_decrement += 16 * sizeof(float); if (mc & (16 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; float32x4_t vacc89AB = vacc0123; float32x4_t vaccCDEF = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); const float32x4_t vi89AB = vld1q_f32(input + 8); const float32x4_t viCDEF = vld1q_f32(input + 12); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x4_t vb = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); vacc4567 = vfmaq_f32(vacc4567, vi4567, vb); vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vb); vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vb); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB = vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 16; } output_decrement += 8 * sizeof(float); if (mc & (8 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x4_t vb = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); vacc4567 = vfmaq_f32(vacc4567, vi4567, vb); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 8; } output_decrement += 4 * sizeof(float); if (mc & (4 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x4_t vb = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0123 = vfmaq_f32(vacc0123, vi0123, vb); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vst1q_f32(output, vout0123); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 4; } output_decrement += 2 * sizeof(float); if (mc & (2 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc01 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi01 = vld1_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x2_t vb = vld1_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc01 = vfma_f32(vacc01, vi01, vb); } while (--nnz != 0); } float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); vout01 = vmax_f32(vout01, vget_low_f32(vmin)); vst1_f32(output, vout01); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 2; } output_decrement += 1 * sizeof(float); if (mc & (1 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc0 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi0 = vld1_dup_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x2_t vb = vld1_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0 = vfma_f32(vacc0, vi0, vb); } while (--nnz != 0); } float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax)); vout0 = vmax_f32(vout0, vget_low_f32(vmin)); vst1_lane_f32(output, vout0, 0); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 1; } } } extern XNN_INTERNAL const int32_t xnn_table_exp2minus_k_over_16[16]; void xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vprescale = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.prescale); const float32x4_t valpha = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.alpha); const float32x4_t vbeta = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.beta); const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.sat_cutoff); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.magic_bias); const float32x4_t vlog2e = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.log2e); const int32x4_t vindex_mask = vmovq_n_s32(0xF); const float32x4_t vminus_ln2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.minus_ln2); const float32x4_t vc3 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.c3); const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut16_p3.c2); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); const float32x4_t vz89AB = vmaxq_f32(vmulq_f32(vx89AB, vprescale), vsat_cutoff); const float32x4_t vzCDEF = vmaxq_f32(vmulq_f32(vxCDEF, vprescale), vsat_cutoff); float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vlog2e); float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vlog2e); float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vlog2e); float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vlog2e); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 2)); const int32x4_t ven0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 19); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 2)); const int32x4_t ven4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 19); const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 2)); const int32x4_t ven89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 19); const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask), 2)); const int32x4_t venCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 19); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); int32x2_t vl01 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01)); int32x2_t vl23 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23)); vl01 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)), vl01, 1); vl23 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)), vl23, 1); const int32x4_t vl0123 = vcombine_s32(vl01, vl23); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); int32x2_t vl45 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45)); int32x2_t vl67 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67)); vl45 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)), vl45, 1); vl67 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)), vl67, 1); const int32x4_t vl4567 = vcombine_s32(vl45, vl67); const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); int32x2_t vl89 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx89)); int32x2_t vlAB = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxAB)); vl89 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx89 >> 32)), vl89, 1); vlAB = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxAB >> 32)), vlAB, 1); const int32x4_t vl89AB = vcombine_s32(vl89, vlAB); const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); int32x2_t vlCD = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxCD)); int32x2_t vlEF = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxEF)); vlCD = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxCD >> 32)), vlCD, 1); vlEF = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxEF >> 32)), vlEF, 1); const int32x4_t vlCDEF = vcombine_s32(vlCD, vlEF); vn0123 = vsubq_f32(vn0123, vmagic_bias); float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vl0123, ven0123)); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vl4567, ven4567)); vn89AB = vsubq_f32(vn89AB, vmagic_bias); float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vl89AB, ven89AB)); vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vlCDEF, venCDEF)); float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vminus_ln2); float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vminus_ln2); float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vminus_ln2); float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vminus_ln2); float32x4_t vp0123 = vfmaq_f32(vc2, vc3, vt0123); float32x4_t vp4567 = vfmaq_f32(vc2, vc3, vt4567); float32x4_t vp89AB = vfmaq_f32(vc2, vc3, vt89AB); float32x4_t vpCDEF = vfmaq_f32(vc2, vc3, vtCDEF); vp0123 = vmulq_f32(vp0123, vt0123); vp4567 = vmulq_f32(vp4567, vt4567); vp89AB = vmulq_f32(vp89AB, vt89AB); vpCDEF = vmulq_f32(vpCDEF, vtCDEF); vt0123 = vmulq_f32(vt0123, vs0123); vs0123 = vsubq_f32(vs0123, vone); vt4567 = vmulq_f32(vt4567, vs4567); vs4567 = vsubq_f32(vs4567, vone); vt89AB = vmulq_f32(vt89AB, vs89AB); vs89AB = vsubq_f32(vs89AB, vone); vtCDEF = vmulq_f32(vtCDEF, vsCDEF); vsCDEF = vsubq_f32(vsCDEF, vone); vp0123 = vfmaq_f32(vt0123, vp0123, vt0123); vp4567 = vfmaq_f32(vt4567, vp4567, vt4567); vp89AB = vfmaq_f32(vt89AB, vp89AB, vt89AB); vpCDEF = vfmaq_f32(vtCDEF, vpCDEF, vtCDEF); const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); const float32x4_t ve89AB = vmulq_f32(vaddq_f32(vp89AB, vs89AB), valpha); const float32x4_t veCDEF = vmulq_f32(vaddq_f32(vpCDEF, vsCDEF), valpha); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); vx0123 = vmulq_f32(vx0123, vbeta); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vx4567 = vmulq_f32(vx4567, vbeta); const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f)); vx89AB = vmulq_f32(vx89AB, vbeta); const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f)); vxCDEF = vmulq_f32(vxCDEF, vbeta); const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); const float32x4_t vy89AB = vbslq_f32(vm89AB, ve89AB, vx89AB); const float32x4_t vyCDEF = vbslq_f32(vmCDEF, veCDEF, vxCDEF); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; vst1q_f32(output, vy89AB); output += 4; vst1q_f32(output, vyCDEF); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vfmaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vfmaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vfmaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vfmaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_velu_ukernel__neonfma_rr1_p6_x8( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vprescale = vld1q_dup_f32(¶ms->neonfma_rr1_p6.prescale); const float32x4_t valpha = vld1q_dup_f32(¶ms->neonfma_rr1_p6.alpha); const float32x4_t vbeta = vld1q_dup_f32(¶ms->neonfma_rr1_p6.beta); const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_p6.sat_cutoff); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_p6.magic_bias); const float32x4_t vlog2e = vld1q_dup_f32(¶ms->neonfma_rr1_p6.log2e); const float32x4_t vminus_ln2 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.minus_ln2); const float32x4_t vc6 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c6); const float32x4_t vc5 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c5); const float32x4_t vc4 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c4); const float32x4_t vc3 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c3); const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_p6.c2); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vlog2e); float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vlog2e); float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); vn0123 = vsubq_f32(vn0123, vmagic_bias); float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vminus_ln2); float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vminus_ln2); float32x4_t vp0123 = vfmaq_f32(vc5, vc6, vt0123); float32x4_t vp4567 = vfmaq_f32(vc5, vc6, vt4567); vp0123 = vfmaq_f32(vc4, vp0123, vt0123); vp4567 = vfmaq_f32(vc4, vp4567, vt4567); vp0123 = vfmaq_f32(vc3, vp0123, vt0123); vp4567 = vfmaq_f32(vc3, vp4567, vt4567); vp0123 = vfmaq_f32(vc2, vp0123, vt0123); vp4567 = vfmaq_f32(vc2, vp4567, vt4567); vp0123 = vmulq_f32(vp0123, vt0123); vp4567 = vmulq_f32(vp4567, vt4567); vt0123 = vmulq_f32(vt0123, vs0123); vs0123 = vsubq_f32(vs0123, vone); vt4567 = vmulq_f32(vt4567, vs4567); vs4567 = vsubq_f32(vs4567, vone); vp0123 = vfmaq_f32(vt0123, vp0123, vt0123); vp4567 = vfmaq_f32(vt4567, vp4567, vt4567); const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); vx0123 = vmulq_f32(vx0123, vbeta); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vx4567 = vmulq_f32(vx4567, vbeta); const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); float32x4_t vp = vfmaq_f32(vc5, vc6, vt); vp = vfmaq_f32(vc4, vp, vt); vp = vfmaq_f32(vc3, vp, vt); vp = vfmaq_f32(vc2, vp, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vfmaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vlog2e); float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vz, vn, vminus_ln2); float32x4_t vp = vfmaq_f32(vc5, vc6, vt); vp = vfmaq_f32(vc4, vp, vt); vp = vfmaq_f32(vc3, vp, vt); vp = vfmaq_f32(vc2, vp, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vfmaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vscale0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbias0123 = vld1q_f32(w); w += 4; vacc0x0123 = vfmaq_f32(vbias0123, vscale0123, vacc0x0123); vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vscale0123 = vld1q_f32(w); float32x4_t vacc0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); const float32x4_t vbias0123 = vld1q_f32(w + 4); vacc0x0123 = vfmaq_f32(vbias0123, vscale0123, vacc0x0123); vacc1x0123 = vfmaq_f32(vbias0123, vscale0123, vacc1x0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); if (c & (2 * sizeof(float))) { vst1_f32(o0, vacc0x01); o0 += 2; vst1_f32(o1, vacc1x01); o1 += 2; vacc0x01 = vget_high_f32(vacc0x0123); vacc1x01 = vget_high_f32(vacc1x0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64]; void xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16( size_t batch, const float* input, float* output, const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.magic_bias); const float32x4_t vminus_log2e = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.minus_log2e); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vln2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.ln2); const float32x4_t vc2 = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vdenorm_cutoff = vld1q_dup_f32(¶ms->neonfma_rr1_lut64_p2.denorm_cutoff); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vx89AB = vld1q_f32(input); input += 4; const float32x4_t vxCDEF = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vabsq_f32(vx0123); const float32x4_t vz4567 = vabsq_f32(vx4567); const float32x4_t vz89AB = vabsq_f32(vx89AB); const float32x4_t vzCDEF = vabsq_f32(vxCDEF); float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e); float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e); float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e); const int32x4_t ve0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 17); const int32x4_t ve4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 17); const int32x4_t ve89AB = vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 17); const int32x4_t veCDEF = vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 17); // Use bits 0:6 bits of batch, as integer, as an index for table lookup of l := 2**(batch % 64). const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask)); const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx23]); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx67]); const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0); const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1); float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx89]); float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxAB]); const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0); const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1); float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxCD]); float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidxEF]); vl01 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); vl89 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx89 >> 32)], vl89, 1); vlAB = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxAB >> 32)], vlAB, 1); const float32x4_t vl89AB = vcombine_f32(vl89, vlAB); vlCD = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxCD >> 32)], vlCD, 1); vlEF = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidxEF >> 32)], vlEF, 1); const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB)); const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); vn89AB = vsubq_f32(vn89AB, vmagic_bias); vnCDEF = vsubq_f32(vnCDEF, vmagic_bias); float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2); float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2); float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2); float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); float32x4_t vp89AB = vmulq_f32(vt89AB, vc2); float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc2); vp0123 = vfmsq_f32(vt0123, vp0123, vt0123); vp4567 = vfmsq_f32(vt4567, vp4567, vt4567); vp89AB = vfmsq_f32(vt89AB, vp89AB, vt89AB); vpCDEF = vfmsq_f32(vtCDEF, vpCDEF, vtCDEF); const float32x4_t vy0123 = vfmsq_f32(vs0123, vs0123, vp0123); const float32x4_t vy4567 = vfmsq_f32(vs4567, vs4567, vp4567); const float32x4_t vy89AB = vfmsq_f32(vs89AB, vs89AB, vp89AB); const float32x4_t vyCDEF = vfmsq_f32(vsCDEF, vsCDEF, vpCDEF); const float32x4_t vd0123 = vaddq_f32(vy0123, vone); const float32x4_t vd4567 = vaddq_f32(vy4567, vone); const float32x4_t vd89AB = vaddq_f32(vy89AB, vone); const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone); float32x4_t vr0123 = vrecpeq_f32(vd0123); float32x4_t vr4567 = vrecpeq_f32(vd4567); float32x4_t vr89AB = vrecpeq_f32(vd89AB); float32x4_t vrCDEF = vrecpeq_f32(vdCDEF); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB)); vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF)); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB)); vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF)); float32x4_t vf0123 = vmulq_f32(vy0123, vr0123); float32x4_t vf4567 = vmulq_f32(vy4567, vr4567); float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB); float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff))); vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff))); vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff))); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f)); const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f)); vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123)); vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567)); vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB)); vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF)); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; vst1q_f32(output, vf89AB); output += 4; vst1q_f32(output, vfCDEF); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vz, vn, vln2); float32x4_t vp = vmulq_f32(vt, vc2); vp = vfmsq_f32(vt, vp, vt); const float32x4_t vy = vfmsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); vst1q_f32(output, vf); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vfmaq_f32(vz, vn, vln2); float32x4_t vp = vmulq_f32(vt, vc2); vp = vfmsq_f32(vt, vp, vt); const float32x4_t vy = vfmsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); } } } void xnn_f32_vtanh_ukernel__neonfma_expm1minus_rr1_p6h5ts_nr2fma_x8( size_t batch, const float* input, float* output, const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { const float32x4_t vsat_cutoff = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.sat_cutoff); const float32x4_t vminus_log2e = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.minus_log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.magic_bias); const float32x4_t vln2 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.ln2); const float32x4_t vc6 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c6); const float32x4_t vc5 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c5); const float32x4_t vc4 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c4); const float32x4_t vc3 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c3); const float32x4_t vc2 = vld1q_dup_f32(¶ms->neon_expm1minus_rr1_p6h5.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vtwo = vmovq_n_f32(2.0f); const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vz0123 = vabsq_f32(vx0123); float32x4_t vz4567 = vabsq_f32(vx4567); vz0123 = vminq_f32(vz0123, vsat_cutoff); vz4567 = vminq_f32(vz4567, vsat_cutoff); float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e); const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); const float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2); const float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2); float32x4_t vp0123 = vfmaq_f32(vc5, vc6, vt0123); float32x4_t vp4567 = vfmaq_f32(vc5, vc6, vt4567); vp0123 = vfmaq_f32(vc4, vp0123, vt0123); vp0123 = vfmaq_f32(vc3, vp0123, vt0123); vp0123 = vfmaq_f32(vc2, vp0123, vt0123); vp4567 = vfmaq_f32(vc4, vp4567, vt4567); vp4567 = vfmaq_f32(vc3, vp4567, vt4567); vp4567 = vfmaq_f32(vc2, vp4567, vt4567); vp0123 = vfmsq_f32(vtwo, vp0123, vt0123); vp4567 = vfmsq_f32(vtwo, vp4567, vt4567); const float32x4_t vts0123 = vmulq_f32(vt0123, vs0123); const float32x4_t vsmo0123 = vsubq_f32(vs0123, vone); const float32x4_t vts4567 = vmulq_f32(vt4567, vs4567); const float32x4_t vsmo4567 = vsubq_f32(vs4567, vone); const float32x4_t vemo0123 = vfmsq_f32(vsmo0123, vp0123, vts0123); const float32x4_t vemo4567 = vfmsq_f32(vsmo4567, vp4567, vts4567); const float32x4_t vepo0123 = vaddq_f32(vemo0123, vtwo); const float32x4_t vepo4567 = vaddq_f32(vemo4567, vtwo); float32x4_t vrepo0123 = vrecpeq_f32(vepo0123); float32x4_t vrepo4567 = vrecpeq_f32(vepo4567); float32x4_t verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123); float32x4_t verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567); vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123); vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567); verepo0123 = vfmsq_f32(vone, vrepo0123, vepo0123); verepo4567 = vfmsq_f32(vone, vrepo4567, vepo4567); vrepo0123 = vfmaq_f32(vrepo0123, vrepo0123, verepo0123); vrepo4567 = vfmaq_f32(vrepo4567, vrepo4567, verepo4567); float32x4_t vy0123 = vmulq_f32(vemo0123, vrepo0123); float32x4_t vy4567 = vmulq_f32(vemo4567, vrepo4567); vy0123 = vbslq_f32(vsign_mask, vx0123, vy0123); vy4567 = vbslq_f32(vsign_mask, vx4567, vy4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vfmaq_f32(vz, vn, vln2); float32x4_t vp = vfmaq_f32(vc5, vc6, vt); vp = vfmaq_f32(vc4, vp, vt); vp = vfmaq_f32(vc3, vp, vt); vp = vfmaq_f32(vc2, vp, vt); vp = vfmsq_f32(vtwo, vp, vt); const float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo); vrepo = vfmaq_f32(vrepo, vrepo, verepo); verepo = vfmsq_f32(vone, vrepo, vepo); vrepo = vfmaq_f32(vrepo, vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vfmaq_f32(vz, vn, vln2); float32x4_t vp = vfmaq_f32(vc5, vc6, vt); vp = vfmaq_f32(vc4, vp, vt); vp = vfmaq_f32(vc3, vp, vt); vp = vfmaq_f32(vc2, vp, vt); vp = vfmsq_f32(vtwo, vp, vt); const float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vfmsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vfmsq_f32(vone, vrepo, vepo); vrepo = vfmaq_f32(vrepo, vrepo, verepo); verepo = vfmsq_f32(vone, vrepo, vepo); vrepo = vfmaq_f32(vrepo, vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); float32x2_t vy_low = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_low); output += 2; vy_low = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_low, 0); } } }