// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <arm_neon.h>

// The original include list did not survive in this copy; the headers below
// are the subset the kernels in this file require (reconstructed from usage,
// so treat the exact set as an assumption).
#include <xnnpack/argmaxpool.h>
#include <xnnpack/avgpool.h>
#include <xnnpack/common.h>
#include <xnnpack/conv.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>
#include <xnnpack/vcvt.h>


void xnn_f16_f32_vcvt_ukernel__neon_int16_x16(
    size_t batch,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const uint16x8_t vsign_mask = vmovq_n_u16(0x8000);
  const uint16x8_t vexp_offset = vmovq_n_u16(0x7000);
  const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale);
  const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000);
  const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400);

  const uint16_t* i = (const uint16_t*) input;
  for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
    const uint16x8_t vh0 = vld1q_u16(i); i += 8;
    const uint16x8_t vh1 = vld1q_u16(i); i += 8;

    const uint16x8_t vsign0 = vandq_u16(vh0, vsign_mask);
    const uint16x8_t vsign1 = vandq_u16(vh1, vsign_mask);

    const uint16x8_t vnonsign0 = veorq_u16(vh0, vsign0);
    const uint16x8_t vnonsign1 = veorq_u16(vh1, vsign1);

    const uint16x8x2_t vprenorm0 = vzipq_u16(vshlq_n_u16(vnonsign0, 13), vsraq_n_u16(vexp_offset, vnonsign0, 3));
    const uint16x8x2_t vprenorm1 = vzipq_u16(vshlq_n_u16(vnonsign1, 13), vsraq_n_u16(vexp_offset, vnonsign1, 3));

    const float32x4_t vnorm0 = vmulq_f32(vreinterpretq_f32_u16(vprenorm0.val[0]), vexp_scale);
    const float32x4_t vnorm1 = vmulq_f32(vreinterpretq_f32_u16(vprenorm0.val[1]), vexp_scale);
    const float32x4_t vnorm2 = vmulq_f32(vreinterpretq_f32_u16(vprenorm1.val[0]), vexp_scale);
    const float32x4_t vnorm3 = vmulq_f32(vreinterpretq_f32_u16(vprenorm1.val[1]), vexp_scale);

    const float32x4_t vdenorm0 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign0))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm1 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign0))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm2 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign1))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm3 = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign1))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask0 = vcgtq_u16(vnonsign0, vdenorm_cutoff);
    const uint16x8_t vmask1 = vcgtq_u16(vnonsign1, vdenorm_cutoff);

    const uint32x4_t vxmask0 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask0))));
    const uint32x4_t vf0 = vorrq_u32(vshll_n_u16(vget_low_u16(vsign0), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask0, vnorm0, vdenorm0)));
    const uint32x4_t vxmask2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask1))));
    const uint32x4_t vf2 = vorrq_u32(vshll_n_u16(vget_low_u16(vsign1), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask2, vnorm2, vdenorm2)));
    const uint32x4_t vxmask1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask0))));
    const uint32x4_t vf1 = vorrq_u32(vshll_n_u16(vget_high_u16(vsign0), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask1, vnorm1, vdenorm1)));
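    // Note on the vxmask*/vf* pattern above and below: vmovl_s16 sign-extends
    // each 16-bit comparison lane to 32 bits (0xFFFF -> 0xFFFFFFFF), so the
    // vbslq_f32 can select the normalized result where the input exceeds the
    // denormal cutoff and the magic-bias result elsewhere; the half's sign is
    // then re-attached with a widening shift and OR.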
    const uint32x4_t vxmask3 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask1))));
    const uint32x4_t vf3 = vorrq_u32(vshll_n_u16(vget_high_u16(vsign1), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask3, vnorm3, vdenorm3)));

    vst1q_f32(output, vreinterpretq_f32_u32(vf0)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf1)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf2)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf3)); output += 4;
  }
  for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
    const uint16x8_t vh = vld1q_u16(i); i += 8;

    const uint16x8_t vsign = vandq_u16(vh, vsign_mask);
    const uint16x8_t vnonsign = veorq_u16(vh, vsign);

    const uint16x8x2_t vprenorm = vzipq_u16(vshlq_n_u16(vnonsign, 13), vsraq_n_u16(vexp_offset, vnonsign, 3));
    const float32x4_t vnorm_lo = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[0]), vexp_scale);
    const float32x4_t vnorm_hi = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[1]), vexp_scale);

    const float32x4_t vdenorm_lo = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm_hi = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask = vcgtq_u16(vnonsign, vdenorm_cutoff);

    const uint32x4_t vxmask_lo = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask))));
    const uint32x4_t vf_lo = vorrq_u32(vshll_n_u16(vget_low_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_lo, vnorm_lo, vdenorm_lo)));
    const uint32x4_t vxmask_hi = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask))));
    const uint32x4_t vf_hi = vorrq_u32(vshll_n_u16(vget_high_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_hi, vnorm_hi, vdenorm_hi)));

    vst1q_f32(output, vreinterpretq_f32_u32(vf_lo)); output += 4;
    vst1q_f32(output, vreinterpretq_f32_u32(vf_hi)); output += 4;
  }
  if XNN_UNPREDICTABLE(batch != 0) {
    const uint16x8_t vh = vld1q_u16(i); i += 8;

    const uint16x8_t vsign = vandq_u16(vh, vsign_mask);
    const uint16x8_t vnonsign = veorq_u16(vh, vsign);

    const uint16x8x2_t vprenorm = vzipq_u16(vshlq_n_u16(vnonsign, 13), vsraq_n_u16(vexp_offset, vnonsign, 3));
    const float32x4_t vnorm_lo = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[0]), vexp_scale);
    const float32x4_t vnorm_hi = vmulq_f32(vreinterpretq_f32_u16(vprenorm.val[1]), vexp_scale);

    const float32x4_t vdenorm_lo = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_low_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));
    const float32x4_t vdenorm_hi = vsubq_f32(vreinterpretq_f32_u32(vaddw_u16(vmagic_bias, vget_high_u16(vnonsign))), vreinterpretq_f32_u32(vmagic_bias));

    const uint16x8_t vmask = vcgtq_u16(vnonsign, vdenorm_cutoff);

    const uint32x4_t vxmask_lo = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vmask))));
    uint32x4_t vf = vorrq_u32(vshll_n_u16(vget_low_u16(vsign), 16),
      vreinterpretq_u32_f32(vbslq_f32(vxmask_lo, vnorm_lo, vdenorm_lo)));

    if (batch & (4 * sizeof(uint16_t))) {
      vst1q_f32(output, vreinterpretq_f32_u32(vf)); output += 4;

      const uint32x4_t vxmask_hi = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vmask))));
      vf = vorrq_u32(vshll_n_u16(vget_high_u16(vsign), 16),
        vreinterpretq_u32_f32(vbslq_f32(vxmask_hi, vnorm_hi, vdenorm_hi)));
    }
    uint32x2_t vf_lo = vget_low_u32(vf);
    if (batch & (2 * sizeof(uint16_t))) {
      vst1_f32(output, vreinterpret_f32_u32(vf_lo)); output += 2;
      vf_lo = vget_high_u32(vf);
    }
    if (batch & (1 * sizeof(uint16_t))) {
      vst1_lane_f32(output, vreinterpret_f32_u32(vf_lo), 0);
    }
  }
}
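
// Illustrative scalar model of the conversion performed by the kernel above.
// This helper is not part of XNNPACK; it is a minimal sketch assuming IEEE
// binary16 input and exp_scale == 0x1.0p-112f (the value XNNPACK's default
// parameter initialization uses for this kernel family).
static inline float f16_to_f32_scalar_sketch(uint16_t h) {
  union { uint32_t u; float f; } bits, word, magic;
  const uint32_t sign = (uint32_t) (h & UINT16_C(0x8000)) << 16;
  const uint16_t nonsign = h & UINT16_C(0x7FFF);
  if (nonsign > UINT16_C(0x0400)) {
    // Normal, infinite, or NaN halves: shift the exponent/mantissa into fp32
    // position (this is the 32-bit word the vzipq_u16 of the shifted halves
    // assembles), then rebias via the 0x7000 offset plus a 2**-112 rescale.
    word.u = ((uint32_t) nonsign << 13) + UINT32_C(0x70000000);
    bits.f = word.f * 0x1.0p-112f;
  } else {
    // Zero and denormal halves (the vcgtq_u16 path being false): integer-add
    // the mantissa to the bit pattern of 0.5f, then subtract 0.5f, which
    // leaves exactly mantissa * 2**-24.
    magic.u = UINT32_C(0x3F000000);
    word.u = magic.u + (uint32_t) nonsign;
    bits.f = word.f - magic.f;
  }
  bits.u |= sign;
  return bits.f;
}
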
void xnn_f32_argmaxpool_ukernel_4x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 4);
  assert(channels != 0);

  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements < 4) {
      i3 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      vst1q_f32(output, vmax); output += 4;
      vst1q_u32(index, vidx); index += 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      float32x2_t vmax_lo = vget_low_f32(vmax);
      uint32x2_t vidx_lo = vget_low_u32(vidx);
      if (c & 2) {
        vst1_f32(output, vmax_lo); output += 2;
        vst1_u32(index, vidx_lo); index += 2;
        vmax_lo = vget_high_f32(vmax);
        vidx_lo = vget_high_u32(vidx);
      }
      if (c & 1) {
        vst1_lane_f32(output, vmax_lo, 0); output += 1;
        vst1_lane_u32(index, vidx_lo, 0); index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
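
// The compare/select sequence used by every argmax kernel in this file is the
// vector form of the scalar update
//   if (vi > vmax) { vmax = vi; vidx = k; }
// vcgtq_f32 yields an all-ones lane mask where the candidate is strictly
// greater, and the vbslq_f32/vbslq_u32 pair routes the new maximum and its
// pooling index through that mask. Because the comparison is strict, ties
// keep the index of the earliest pooling element.
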
void xnn_f32_argmaxpool_ukernel_9p8x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* accumulation_buffer,
    uint32_t* index_buffer,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements > 9);
  assert(channels != 0);

  do {
    {
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      const float* i8 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      i8 = (const float*) ((uintptr_t) i8 + input_offset);

      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

        float32x4_t vmax = vi0;
        uint32x4_t vidx = vmovq_n_u32(0);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

        const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
        vmax = vbslq_f32(vm8, vi8, vmax);
        vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

        vst1q_f32(ab, vmax); ab += 4;
        vst1q_u32(ib, vidx); ib += 4;
      }
    }
    const uint32x4_t v1 = vmovq_n_u32(1);
    const uint32x4_t v8 = vmovq_n_u32(8);
    uint32x4_t vidx0 = vaddq_u32(v1, v8);

    size_t k = pooling_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);

      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;

        float32x4_t vmax = vld1q_f32(ab);
        uint32x4_t vidx = vld1q_u32(ib);

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        vst1q_f32(ab, vmax); ab += 4;
        vst1q_u32(ib, vidx); ib += 4;
      }
      vidx0 = vaddq_u32(vidx0, v8);
    }

    float* o = output;
    uint32_t* i = index;
    {
      const float* i0 = input[0];
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = i0;
      }
      if (k <= 2) {
        i2 = i0;
      }
      if (k < 4) {
        i3 = i0;
      }
      if (k <= 4) {
        i4 = i0;
      }
      if (k < 6) {
        i5 = i0;
      }
      if (k <= 6) {
        i6 = i0;
      }
      if (k < 8) {
        i7 = i0;
      }

      size_t c = channels;
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;
      for (; c >= 4; c -= 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;

        float32x4_t vmax = vld1q_f32(ab); ab += 4;
        uint32x4_t vidx = vld1q_u32(ib); ib += 4;

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);
        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        vst1q_f32(o, vmax); o += 4;
        vst1q_u32(i, vidx); i += 4;
      }
      if (c != 0) {
        const float32x4_t vi0 = vld1q_f32(i0);
        const float32x4_t vi1 = vld1q_f32(i1);
        const float32x4_t vi2 = vld1q_f32(i2);
        const float32x4_t vi3 = vld1q_f32(i3);
        const float32x4_t vi4 = vld1q_f32(i4);
        const float32x4_t vi5 = vld1q_f32(i5);
        const float32x4_t vi6 = vld1q_f32(i6);
        const float32x4_t vi7 = vld1q_f32(i7);

        float32x4_t vmax = vld1q_f32(ab);
        uint32x4_t vidx = vld1q_u32(ib);

        const uint32x4_t vm0 = vcgtq_f32(vi0, vmax);
        vmax = vbslq_f32(vm0, vi0, vmax);
        vidx = vbslq_u32(vm0, vidx0, vidx);

        const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
        const uint32x4_t vidx1 = vaddq_u32(vidx0, v1);
        vmax = vbslq_f32(vm1, vi1, vmax);
        vidx = vbslq_u32(vm1, vidx1, vidx);

        const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
        const uint32x4_t vidx2 = vaddq_u32(vidx1, v1);
        vmax = vbslq_f32(vm2, vi2, vmax);
        vidx = vbslq_u32(vm2, vidx2, vidx);

        const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
        const uint32x4_t vidx3 = vaddq_u32(vidx2, v1);
        vmax = vbslq_f32(vm3, vi3, vmax);
        vidx = vbslq_u32(vm3, vidx3, vidx);

        const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
        const uint32x4_t vidx4 = vaddq_u32(vidx3, v1);
        vmax = vbslq_f32(vm4, vi4, vmax);
        vidx = vbslq_u32(vm4, vidx4, vidx);

        const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
        const uint32x4_t vidx5 = vaddq_u32(vidx4, v1);
        vmax = vbslq_f32(vm5, vi5, vmax);
        vidx = vbslq_u32(vm5, vidx5, vidx);

        const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
        const uint32x4_t vidx6 = vaddq_u32(vidx5, v1);
        vmax = vbslq_f32(vm6, vi6, vmax);
        vidx = vbslq_u32(vm6, vidx6, vidx);

        const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
        const uint32x4_t vidx7 = vaddq_u32(vidx6, v1);
        vmax = vbslq_f32(vm7, vi7, vmax);
        vidx = vbslq_u32(vm7, vidx7, vidx);

        float32x2_t vmax_lo = vget_low_f32(vmax);
        uint32x2_t vidx_lo = vget_low_u32(vidx);
        if (c & 2) {
          vst1_f32(o, vmax_lo); o += 2;
          vst1_u32(i, vidx_lo); i += 2;
          vmax_lo = vget_high_f32(vmax);
          vidx_lo = vget_high_u32(vidx);
        }
        if (c & 1) {
          vst1_lane_f32(o, vmax_lo, 0); o += 1;
          vst1_lane_u32(i, vidx_lo, 0); i += 1;
        }
      }
    }

    output = (float*) ((uintptr_t) o + output_increment);
    index = (uint32_t*) i;
  } while (--output_pixels != 0);
}
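
// Multipass layout of the 9p8x kernel above: the first pass reduces pooling
// elements 0-8 into accumulation_buffer/index_buffer, each middle pass folds
// in 8 more elements (vidx0 starts at 9 and advances by 8 per pass), and the
// final pass combines the last 1-8 elements with the buffered running maxima
// while writing the results to output/index. Out-of-range rows in the final
// pass alias i0, which is harmless because i0's value can no longer win the
// strict comparison once it has been folded in.
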
void xnn_f32_argmaxpool_ukernel_9x__neon_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 9);
  assert(channels != 0);

  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    i4 = (const float*) ((uintptr_t) i4 + input_offset);
    i5 = (const float*) ((uintptr_t) i5 + input_offset);
    i6 = (const float*) ((uintptr_t) i6 + input_offset);
    i7 = (const float*) ((uintptr_t) i7 + input_offset);
    i8 = (const float*) ((uintptr_t) i8 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements < 4) {
      i3 = i0;
    }
    if (pooling_elements <= 4) {
      i4 = i0;
    }
    if (pooling_elements < 6) {
      i5 = i0;
    }
    if (pooling_elements <= 6) {
      i6 = i0;
    }
    if (pooling_elements < 8) {
      i7 = i0;
    }
    if (pooling_elements <= 8) {
      i8 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
      const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
      vmax = vbslq_f32(vm4, vi4, vmax);
      vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

      const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
      vmax = vbslq_f32(vm5, vi5, vmax);
      vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

      const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
      vmax = vbslq_f32(vm6, vi6, vmax);
      vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

      const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
      vmax = vbslq_f32(vm7, vi7, vmax);
      vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

      const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
      vmax = vbslq_f32(vm8, vi8, vmax);
      vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

      vst1q_f32(output, vmax); output += 4;
      vst1q_u32(index, vidx); index += 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);
      const float32x4_t vi4 = vld1q_f32(i4);
      const float32x4_t vi5 = vld1q_f32(i5);
      const float32x4_t vi6 = vld1q_f32(i6);
      const float32x4_t vi7 = vld1q_f32(i7);
      const float32x4_t vi8 = vld1q_f32(i8);

      float32x4_t vmax = vi0;
      uint32x4_t vidx = vmovq_n_u32(0);

      const uint32x4_t vm1 = vcgtq_f32(vi1, vmax);
      vmax = vbslq_f32(vm1, vi1, vmax);
      vidx = vbslq_u32(vm1, vmovq_n_u32(1), vidx);

      const uint32x4_t vm2 = vcgtq_f32(vi2, vmax);
      vmax = vbslq_f32(vm2, vi2, vmax);
      vidx = vbslq_u32(vm2, vmovq_n_u32(2), vidx);

      const uint32x4_t vm3 = vcgtq_f32(vi3, vmax);
      vmax = vbslq_f32(vm3, vi3, vmax);
      vidx = vbslq_u32(vm3, vmovq_n_u32(3), vidx);

      const uint32x4_t vm4 = vcgtq_f32(vi4, vmax);
      vmax = vbslq_f32(vm4, vi4, vmax);
      vidx = vbslq_u32(vm4, vmovq_n_u32(4), vidx);

      const uint32x4_t vm5 = vcgtq_f32(vi5, vmax);
      vmax = vbslq_f32(vm5, vi5, vmax);
      vidx = vbslq_u32(vm5, vmovq_n_u32(5), vidx);

      const uint32x4_t vm6 = vcgtq_f32(vi6, vmax);
      vmax = vbslq_f32(vm6, vi6, vmax);
      vidx = vbslq_u32(vm6, vmovq_n_u32(6), vidx);

      const uint32x4_t vm7 = vcgtq_f32(vi7, vmax);
      vmax = vbslq_f32(vm7, vi7, vmax);
      vidx = vbslq_u32(vm7, vmovq_n_u32(7), vidx);

      const uint32x4_t vm8 = vcgtq_f32(vi8, vmax);
      vmax = vbslq_f32(vm8, vi8, vmax);
      vidx = vbslq_u32(vm8, vmovq_n_u32(8), vidx);

      float32x2_t vmax_lo = vget_low_f32(vmax);
      uint32x2_t vidx_lo = vget_low_u32(vidx);
      if (c & 2) {
        vst1_f32(output, vmax_lo); output += 2;
        vst1_u32(index, vidx_lo); index += 2;
        vmax_lo = vget_high_f32(vmax);
        vidx_lo = vget_high_u32(vidx);
      }
      if (c & 1) {
        vst1_lane_f32(output, vmax_lo, 0); output += 1;
        vst1_lane_u32(index, vidx_lo, 0); index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
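
// Hypothetical scalar reference for one output pixel of the argmax pooling
// kernels (not an XNNPACK API; a sketch for checking the vector logic, with
// `in` holding `pooling_elements` row pointers already offset for this pixel).
static inline void argmaxpool_scalar_sketch(
    size_t pooling_elements,
    size_t channels,
    const float** in,
    float* out,
    uint32_t* index)
{
  for (size_t c = 0; c < channels; c++) {
    float vmax = in[0][c];
    uint32_t vidx = 0;
    for (size_t k = 1; k < pooling_elements; k++) {
      // Strict comparison: ties keep the earliest index, as in the kernels.
      if (in[k][c] > vmax) {
        vmax = in[k][c];
        vidx = (uint32_t) k;
      }
    }
    out[c] = vmax;
    index[c] = vidx;
  }
}
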
void xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* buffer,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  do {
    {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }
      const float* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

        vst1q_f32(b, vsum); b += 4;
      }
    }

    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vacc = vld1q_f32(b);

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        vst1q_f32(b, vsum); b += 4;
      }
    }

    assert(k >= 1);
    {
      const float* i0 = input[0];
      assert(i0 != NULL);
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      size_t c = channels;
      float* b = buffer;
      while (c >= 4) {
        const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
        const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
        const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
        const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
        const float32x4_t vacc = vld1q_f32(b); b += 4;
        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        float32x4_t vout = vmulq_f32(vsum, vscale);
        vout = vmaxq_f32(vout, vmin);
        vout = vminq_f32(vout, vmax);

        vst1q_f32(output, vout); output += 4;

        c -= 4;
      }
      if (c != 0) {
        const float32x4_t vi0 = vld1q_f32(i0);
        const float32x4_t vi1 = vld1q_f32(i1);
        const float32x4_t vi2 = vld1q_f32(i2);
        const float32x4_t vi3 = vld1q_f32(i3);
        const float32x4_t vi4 = vld1q_f32(i4);
        const float32x4_t vi5 = vld1q_f32(i5);
        const float32x4_t vi6 = vld1q_f32(i6);
        const float32x4_t vi7 = vld1q_f32(i7);
        const float32x4_t vacc = vld1q_f32(b);

        const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
        const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
        const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
        const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
        const float32x4_t vsum01a = vaddq_f32(vsum01, vacc);
        const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
        const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67);
        const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a);

        float32x4_t vout = vmulq_f32(vsum, vscale);
        vout = vmaxq_f32(vout, vmin);
        vout = vminq_f32(vout, vmax);

        float32x2_t vout_lo = vget_low_f32(vout);
        if (c & 2) {
          vst1_f32(output, vout_lo); output += 2;
          vout_lo = vget_high_f32(vout);
        }
        if (c & 1) {
          vst1_lane_f32(output, vout_lo, 0); output += 1;
        }
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
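
// The 9p8x average-pooling kernel accumulates raw sums in `buffer` across
// passes and applies vscale together with the vmin/vmax clamp only once, in
// the final pass. Assuming the caller sets params->scalar.scale to
// 1.0f / kernel_elements (as XNNPACK's operator setup does), the stored
// result is the clamped mean of the pooling window.
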
void xnn_f32_avgpool_minmax_ukernel_9x__neon_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  do {
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    input = (const float**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) {
      i1 = zero;
    }
    assert(i1 != NULL);
    if (kernel_elements <= 2) {
      i2 = zero;
    }
    assert(i2 != NULL);
    if (kernel_elements < 4) {
      i3 = zero;
    }
    assert(i3 != NULL);
    if (kernel_elements <= 4) {
      i4 = zero;
    }
    assert(i4 != NULL);
    if (kernel_elements < 6) {
      i5 = zero;
    }
    assert(i5 != NULL);
    if (kernel_elements <= 6) {
      i6 = zero;
    }
    assert(i6 != NULL);
    if (kernel_elements < 8) {
      i7 = zero;
    }
    assert(i7 != NULL);
    if (kernel_elements <= 8) {
      i8 = zero;
    }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vi7 = vld1q_f32(i7); i7 += 4;
      const float32x4_t vi8 = vld1q_f32(i8); i8 += 4;

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
      const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
      const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
      const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
      const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

      float32x4_t vout = vmulq_f32(vsum, vscale);
      vout = vmaxq_f32(vout, vmin);
      vout = vminq_f32(vout, vmax);

      vst1q_f32(output, vout); output += 4;

      c -= 4;
    }
    if (c != 0) {
      const float32x4_t vi0 = vld1q_f32(i0);
      const float32x4_t vi1 = vld1q_f32(i1);
      const float32x4_t vi2 = vld1q_f32(i2);
      const float32x4_t vi3 = vld1q_f32(i3);
      const float32x4_t vi4 = vld1q_f32(i4);
      const float32x4_t vi5 = vld1q_f32(i5);
      const float32x4_t vi6 = vld1q_f32(i6);
      const float32x4_t vi7 = vld1q_f32(i7);
      const float32x4_t vi8 = vld1q_f32(i8);

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum67 = vaddq_f32(vi6, vi7);
      const float32x4_t vsum018 = vaddq_f32(vsum01, vi8);
      const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
      const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67);
      const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678);

      float32x4_t vout = vmulq_f32(vsum, vscale);
      vout = vmaxq_f32(vout, vmin);
      vout = vminq_f32(vout, vmax);

      float32x2_t vout_lo = vget_low_f32(vout);
      if (c & 2) {
        vst1_f32(output, vout_lo); output += 2;
        vout_lo = vget_high_f32(vout);
      }
      if (c & 1) {
        vst1_lane_f32(output, vout_lo, 0); output += 1;
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
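
// Minimal scalar sketch of one output element of the average-pooling kernels
// (illustrative only; `scale`, `min`, and `max` mirror the params fields).
// Out-of-range taps contribute zeros, which is why the vector kernels alias
// unused row pointers to the shared `zero` buffer instead of cloning i0.
static inline float avgpool_minmax_scalar_sketch(
    const float* v,
    size_t kernel_elements,
    float scale,
    float min,
    float max)
{
  float sum = 0.0f;
  for (size_t k = 0; k < kernel_elements; k++) {
    sum += v[k];
  }
  float out = sum * scale;
  out = out < min ? min : out;  // vmaxq_f32(vout, vmin)
  out = out > max ? max : out;  // vminq_f32(vout, vmax)
  return out;
}
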
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_channel_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      output1 = output0;
    }

    const float* w = weights;
    size_t c = output_channels;
    float* o0c0 = output0;
    float* o1c0 = output1;
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
    float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
    float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
    float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
    do {
      if XNN_UNPREDICTABLE(c < 2) {
        o0c1 = o0c0;
        o1c1 = o1c0;
      }
      if XNN_UNPREDICTABLE(c <= 2) {
        o0c2 = o0c1;
        o1c2 = o1c1;
      }
      if XNN_UNPREDICTABLE(c < 4) {
        o0c3 = o0c2;
        o1c3 = o1c2;
      }

      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      float32x4_t vi0x0 = vmovq_n_f32(0.0f);
      float32x4_t vi1x0 = vmovq_n_f32(0.0f);
      float32x4_t vi2x0 = vmovq_n_f32(0.0f);
      float32x4_t vi3x0 = vmovq_n_f32(0.0f);
      float32x4_t vi4x0 = vmovq_n_f32(0.0f);

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
        float32x4_t vo0x0 = vld1q_f32(w);
        float32x4_t vo1x0 = vo0x0;
        float32x4_t vo0x1 = vo0x0;
        float32x4_t vo1x1 = vo0x0;

        const float32x4_t vk00c0 = vld1q_f32(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c0, vget_low_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c0, vget_low_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c0, vget_high_f32(vi0x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c0, vget_high_f32(vi2x1), 1);

        const float32x4_t vk10c0 = vld1q_f32(w + 8);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c0, vget_low_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c0, vget_low_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c0, vget_high_f32(vi1x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c0, vget_high_f32(vi3x1), 1);

        const float32x4_t vk20c0 = vld1q_f32(w + 12);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c0, vget_low_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c0, vget_low_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c0, vget_high_f32(vi2x1), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c0, vget_high_f32(vi4x1), 1);

        const float32x4_t vk00c1 = vld1q_f32(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c1, vget_high_f32(vi0x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c1, vget_high_f32(vi2x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c1, vget_low_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c1, vget_low_f32(vi2x2), 0);

        const float32x4_t vk10c1 = vld1q_f32(w + 20);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0);

        const float32x4_t vk20c1 = vld1q_f32(w + 24);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c1, vget_high_f32(vi2x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c1, vget_high_f32(vi4x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c1, vget_low_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c1, vget_low_f32(vi4x2), 0);

        const float32x4_t vk00c2 = vld1q_f32(w + 28);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c2, vget_high_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c2, vget_high_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c2, vget_low_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c2, vget_low_f32(vi2x2), 1);

        const float32x4_t vk10c2 = vld1q_f32(w + 32);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c2, vget_high_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c2, vget_high_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c2, vget_low_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c2, vget_low_f32(vi3x2), 1);

        const float32x4_t vk20c2 = vld1q_f32(w + 36);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c2, vget_high_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c2, vget_high_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c2, vget_low_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c2, vget_low_f32(vi4x2), 1);

        const float32x4_t vk01c0 = vld1q_f32(w + 40);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c0, vget_low_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c0, vget_low_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c0, vget_high_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c0, vget_high_f32(vi2x2), 0);

        const float32x4_t vk11c0 = vld1q_f32(w + 44);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c0, vget_low_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c0, vget_low_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c0, vget_high_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c0, vget_high_f32(vi3x2), 0);

        const float32x4_t vk21c0 = vld1q_f32(w + 48);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c0, vget_low_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c0, vget_low_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c0, vget_high_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c0, vget_high_f32(vi4x2), 0);

        const float32x4_t vk01c1 = vld1q_f32(w + 52);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c1, vget_low_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c1, vget_low_f32(vi2x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c1, vget_high_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c1, vget_high_f32(vi2x2), 1);

        const float32x4_t vk11c1 = vld1q_f32(w + 56);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c1, vget_low_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c1, vget_low_f32(vi3x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c1, vget_high_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c1, vget_high_f32(vi3x2), 1);

        const float32x4_t vk21c1 = vld1q_f32(w + 60);
        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1);

        const float32x4_t vk01c2 = vld1q_f32(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c2, vget_high_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c2, vget_high_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c2, vget_low_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c2, vget_low_f32(vi2x3), 0);

        const float32x4_t vk11c2 = vld1q_f32(w + 68);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c2, vget_high_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c2, vget_high_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c2, vget_low_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c2, vget_low_f32(vi3x3), 0);

        const float32x4_t vk21c2 = vld1q_f32(w + 72);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c2, vget_high_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c2, vget_high_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c2, vget_low_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);

        const float32x4_t vk02c0 = vld1q_f32(w + 76);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c0, vget_high_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c0, vget_high_f32(vi2x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c0, vget_low_f32(vi0x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c0, vget_low_f32(vi2x3), 1);

        const float32x4_t vk12c0 = vld1q_f32(w + 80);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c0, vget_high_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c0, vget_high_f32(vi3x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c0, vget_low_f32(vi1x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c0, vget_low_f32(vi3x3), 1);

        const float32x4_t vk22c0 = vld1q_f32(w + 84);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c0, vget_high_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c0, vget_high_f32(vi4x1), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c0, vget_low_f32(vi2x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c0, vget_low_f32(vi4x3), 1);

        const float32x4_t vk02c1 = vld1q_f32(w + 88);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c1, vget_low_f32(vi0x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c1, vget_low_f32(vi2x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c1, vget_high_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c1, vget_high_f32(vi2x3), 0);

        const float32x4_t vk12c1 = vld1q_f32(w + 92);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c1, vget_low_f32(vi1x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c1, vget_low_f32(vi3x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c1, vget_high_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c1, vget_high_f32(vi3x3), 0);

        const float32x4_t vk22c1 = vld1q_f32(w + 96);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c1, vget_low_f32(vi2x2), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c1, vget_low_f32(vi4x2), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c1, vget_high_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c1, vget_high_f32(vi4x3), 0);

        const float32x4_t vk02c2 = vld1q_f32(w + 100);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk02c2, vget_low_f32(vi0x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk02c2, vget_low_f32(vi2x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk02c2, vget_high_f32(vi0x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk02c2, vget_high_f32(vi2x3), 1);
        const float32x4_t vk12c2 = vld1q_f32(w + 104);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk12c2, vget_low_f32(vi1x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk12c2, vget_low_f32(vi3x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk12c2, vget_high_f32(vi1x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk12c2, vget_high_f32(vi3x3), 1);

        const float32x4_t vk22c2 = vld1q_f32(w + 108);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk22c2, vget_low_f32(vi2x2), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk22c2, vget_low_f32(vi4x2), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk22c2, vget_high_f32(vi2x3), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk22c2, vget_high_f32(vi4x3), 1);

        vi0x0 = vi0x3;
        vi1x0 = vi1x3;
        vi2x0 = vi2x3;
        vi3x0 = vi3x3;
        vi4x0 = vi4x3;

        vo0x0 = vmaxq_f32(vo0x0, vmin);
        vo1x0 = vmaxq_f32(vo1x0, vmin);
        vo0x1 = vmaxq_f32(vo0x1, vmin);
        vo1x1 = vmaxq_f32(vo1x1, vmin);

        vo0x0 = vminq_f32(vo0x0, vmax);
        vo1x0 = vminq_f32(vo1x0, vmax);
        vo0x1 = vminq_f32(vo0x1, vmax);
        vo1x1 = vminq_f32(vo1x1, vmax);

        const float32x4x2_t vo0c0123 = vzipq_f32(vo0x0, vo0x1);
        const float32x4x2_t vo1c0123 = vzipq_f32(vo1x0, vo1x1);

        // Always 2+ output width elements remaining
        vst1_f32(o1c0, vget_low_f32(vo1c0123.val[0])); o1c0 += 2;
        vst1_f32(o1c1, vget_high_f32(vo1c0123.val[0])); o1c1 += 2;
        vst1_f32(o1c2, vget_low_f32(vo1c0123.val[1])); o1c2 += 2;
        vst1_f32(o1c3, vget_high_f32(vo1c0123.val[1])); o1c3 += 2;

        vst1_f32(o0c0, vget_low_f32(vo0c0123.val[0])); o0c0 += 2;
        vst1_f32(o0c1, vget_high_f32(vo0c0123.val[0])); o0c1 += 2;
        vst1_f32(o0c2, vget_low_f32(vo0c0123.val[1])); o0c2 += 2;
        vst1_f32(o0c3, vget_high_f32(vo0c0123.val[1])); o0c3 += 2;
      }
      assert(iw < 4);
      if XNN_UNLIKELY(iw != 0) {
        float32x4_t vo0x0 = vld1q_f32(w);
        float32x4_t vo1x0 = vo0x0;
        float32x4_t vo0x1 = vo0x0;
        float32x4_t vo1x1 = vo0x0;

        const float32x4_t vk00c0 = vld1q_f32(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        float32x4_t vi0x1 = vld1q_f32(i0);
        float32x4_t vi1x1 = vld1q_f32(i1);
        float32x4_t vi2x1 = vld1q_f32(i2);
        float32x4_t vi3x1 = vld1q_f32(i3);
        float32x4_t vi4x1 = vld1q_f32(i4);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c0, vget_low_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c0, vget_low_f32(vi2x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk00c0, vget_high_f32(vi0x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk00c0, vget_high_f32(vi2x1), 1);
        }

        const float32x4_t vk10c0 = vld1q_f32(w + 8);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c0, vget_low_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c0, vget_low_f32(vi3x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk10c0, vget_high_f32(vi1x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk10c0, vget_high_f32(vi3x1), 1);
        }

        const float32x4_t vk20c0 = vld1q_f32(w + 12);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c0, vget_low_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c0, vget_low_f32(vi4x0), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk20c0, vget_high_f32(vi2x1), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk20c0, vget_high_f32(vi4x1), 1);
        }

        const float32x4_t vk00c1 = vld1q_f32(w + 16);

        float32x4_t vi0x2 = vmovq_n_f32(0.0f);
        float32x4_t vi1x2 = vmovq_n_f32(0.0f);
        float32x4_t vi2x2 = vmovq_n_f32(0.0f);
        float32x4_t vi3x2 = vmovq_n_f32(0.0f);
        float32x4_t vi4x2 = vmovq_n_f32(0.0f);
        if (iw >= 2) {
          // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
          vi0x2 = vld1q_f32(i0 + 4);
          vi1x2 = vld1q_f32(i1 + 4);
          vi2x2 = vld1q_f32(i2 + 4);
          vi3x2 = vld1q_f32(i3 + 4);
          vi4x2 = vld1q_f32(i4 + 4);
        }

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c1, vget_high_f32(vi0x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c1, vget_high_f32(vi2x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c1, vget_low_f32(vi0x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c1, vget_low_f32(vi2x2), 0);

        const float32x4_t vk10c1 = vld1q_f32(w + 20);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0);

        const float32x4_t vk20c1 = vld1q_f32(w + 24);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c1, vget_high_f32(vi2x0), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c1, vget_high_f32(vi4x0), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c1, vget_low_f32(vi2x2), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c1, vget_low_f32(vi4x2), 0);

        const float32x4_t vk00c2 = vld1q_f32(w + 28);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk00c2, vget_high_f32(vi0x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk00c2, vget_high_f32(vi2x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk00c2, vget_low_f32(vi0x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk00c2, vget_low_f32(vi2x2), 1);

        const float32x4_t vk10c2 = vld1q_f32(w + 32);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk10c2, vget_high_f32(vi1x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk10c2, vget_high_f32(vi3x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk10c2, vget_low_f32(vi1x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk10c2, vget_low_f32(vi3x2), 1);

        const float32x4_t vk20c2 = vld1q_f32(w + 36);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk20c2, vget_high_f32(vi2x0), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk20c2, vget_high_f32(vi4x0), 1);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk20c2, vget_low_f32(vi2x2), 1);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk20c2, vget_low_f32(vi4x2), 1);

        const float32x4_t vk01c0 = vld1q_f32(w + 40);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c0, vget_low_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c0, vget_low_f32(vi2x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk01c0, vget_high_f32(vi0x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk01c0, vget_high_f32(vi2x2), 0);
        }

        const float32x4_t vk11c0 = vld1q_f32(w + 44);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c0, vget_low_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c0, vget_low_f32(vi3x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk11c0, vget_high_f32(vi1x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk11c0, vget_high_f32(vi3x2), 0);
        }

        const float32x4_t vk21c0 = vld1q_f32(w + 48);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c0, vget_low_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c0, vget_low_f32(vi4x1), 0);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk21c0, vget_high_f32(vi2x2), 0);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk21c0, vget_high_f32(vi4x2), 0);
        }

        const float32x4_t vk01c1 = vld1q_f32(w + 52);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c1, vget_low_f32(vi0x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c1, vget_low_f32(vi2x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk01c1, vget_high_f32(vi0x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk01c1, vget_high_f32(vi2x2), 1);
        }

        const float32x4_t vk11c1 = vld1q_f32(w + 56);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c1, vget_low_f32(vi1x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c1, vget_low_f32(vi3x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk11c1, vget_high_f32(vi1x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk11c1, vget_high_f32(vi3x2), 1);
        }

        const float32x4_t vk21c1 = vld1q_f32(w + 60);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1);
        if (iw > 2) {
          vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1);
          vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1);
        }

        const float32x4_t vk01c2 = vld1q_f32(w + 64);

        float32x4_t vi0x3 = vmovq_n_f32(0.0f);
        float32x4_t vi1x3 = vmovq_n_f32(0.0f);
        float32x4_t vi2x3 = vmovq_n_f32(0.0f);
        float32x4_t vi3x3 = vmovq_n_f32(0.0f);
        float32x4_t vi4x3 = vmovq_n_f32(0.0f);
        if (iw > 2) {
          // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
          vi0x3 = vld1q_lane_f32(i0 + 8, vi0x3, 0);
          vi1x3 = vld1q_lane_f32(i1 + 8, vi1x3, 0);
          vi2x3 = vld1q_lane_f32(i2 + 8, vi2x3, 0);
          vi3x3 = vld1q_lane_f32(i3 + 8, vi3x3, 0);
          vi4x3 = vld1q_lane_f32(i4 + 8, vi4x3, 0);
        }

        vo0x0 = vmlaq_lane_f32(vo0x0, vk01c2, vget_high_f32(vi0x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk01c2, vget_high_f32(vi2x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk01c2, vget_low_f32(vi0x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk01c2, vget_low_f32(vi2x3), 0);

        const float32x4_t vk11c2 = vld1q_f32(w + 68);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk11c2, vget_high_f32(vi1x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk11c2, vget_high_f32(vi3x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk11c2, vget_low_f32(vi1x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk11c2, vget_low_f32(vi3x3), 0);

        const float32x4_t vk21c2 = vld1q_f32(w + 72);

        vo0x0 = vmlaq_lane_f32(vo0x0, vk21c2, vget_high_f32(vi2x1), 0);
        vo1x0 = vmlaq_lane_f32(vo1x0, vk21c2, vget_high_f32(vi4x1), 0);
        vo0x1 = vmlaq_lane_f32(vo0x1, vk21c2, vget_low_f32(vi2x3), 0);
        vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);

        if (iw >= 2) {
          const float32x4_t vk02c0 = vld1q_f32(w + 76);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c0, vget_high_f32(vi0x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c0, vget_high_f32(vi2x1), 1);

          const float32x4_t vk12c0 = vld1q_f32(w + 80);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c0, vget_high_f32(vi1x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c0, vget_high_f32(vi3x1), 1);

          const float32x4_t vk22c0 = vld1q_f32(w + 84);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c0, vget_high_f32(vi2x1), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c0, vget_high_f32(vi4x1), 1);

          const float32x4_t vk02c1 = vld1q_f32(w + 88);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c1, vget_low_f32(vi0x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c1, vget_low_f32(vi2x2), 0);

          const float32x4_t vk12c1 = vld1q_f32(w + 92);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c1, vget_low_f32(vi1x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c1, vget_low_f32(vi3x2), 0);

          const float32x4_t vk22c1 = vld1q_f32(w + 96);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c1, vget_low_f32(vi2x2), 0);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c1, vget_low_f32(vi4x2), 0);

          const float32x4_t vk02c2 = vld1q_f32(w + 100);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk02c2, vget_low_f32(vi0x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk02c2, vget_low_f32(vi2x2), 1);

          const float32x4_t vk12c2 = vld1q_f32(w + 104);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk12c2, vget_low_f32(vi1x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk12c2, vget_low_f32(vi3x2), 1);

          const float32x4_t vk22c2 = vld1q_f32(w + 108);

          vo0x0 = vmlaq_lane_f32(vo0x0, vk22c2, vget_low_f32(vi2x2), 1);
          vo1x0 = vmlaq_lane_f32(vo1x0, vk22c2, vget_low_f32(vi4x2), 1);
        }

        vo0x0 = vmaxq_f32(vo0x0, vmin);
        vo1x0 = vmaxq_f32(vo1x0, vmin);
        vo0x1 = vmaxq_f32(vo0x1, vmin);
        vo1x1 = vmaxq_f32(vo1x1, vmin);

        vo0x0 = vminq_f32(vo0x0, vmax);
        vo1x0 = vminq_f32(vo1x0, vmax);
        vo0x1 = vminq_f32(vo0x1, vmax);
        vo1x1 = vminq_f32(vo1x1, vmax);

        if (iw == 3) {
          // Exactly 2 output width elements remaining
          const float32x4x2_t vo0c0123 = vzipq_f32(vo0x0, vo0x1);
          const float32x4x2_t vo1c0123 = vzipq_f32(vo1x0, vo1x1);

          vst1_f32(o1c0, vget_low_f32(vo1c0123.val[0])); o1c0 += 2;
          vst1_f32(o1c1, vget_high_f32(vo1c0123.val[0])); o1c1 += 2;
          vst1_f32(o1c2, vget_low_f32(vo1c0123.val[1])); o1c2 += 2;
          vst1_f32(o1c3, vget_high_f32(vo1c0123.val[1])); o1c3 += 2;

          vst1_f32(o0c0, vget_low_f32(vo0c0123.val[0])); o0c0 += 2;
vget_high_f32(vo0c0123.val[0])); o0c1 += 2; vst1_f32(o0c2, vget_low_f32(vo0c0123.val[1])); o0c2 += 2; vst1_f32(o0c3, vget_high_f32(vo0c0123.val[1])); o0c3 += 2; } else { // Exactly 1 output width element remaining vst1q_lane_f32(o1c0, vo1x0, 0); o1c0 += 1; vst1q_lane_f32(o1c1, vo1x0, 1); o1c1 += 1; vst1q_lane_f32(o1c2, vo1x0, 2); o1c2 += 1; vst1q_lane_f32(o1c3, vo1x0, 3); o1c3 += 1; vst1q_lane_f32(o0c0, vo0x0, 0); o0c0 += 1; vst1q_lane_f32(o0c1, vo0x0, 1); o0c1 += 1; vst1q_lane_f32(o0c2, vo0x0, 2); o0c2 += 1; vst1q_lane_f32(o0c3, vo0x0, 3); o0c3 += 1; } } // Move output pointers back to the position of the first pixel in a row, // and forward to the next block of output channels. o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment); o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment); o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment); o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment); o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment); o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment); o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment); o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment); // Revert input pointers to the position of the first pixel in a row i0 = (const float*) ((uintptr_t) i0 - input_width_increment); i1 = (const float*) ((uintptr_t) i1 - input_width_increment); i2 = (const float*) ((uintptr_t) i2 - input_width_increment); i3 = (const float*) ((uintptr_t) i3 - input_width_increment); i4 = (const float*) ((uintptr_t) i4 - input_width_increment); // Move to the block of weights for the next 4 output channels w += 112; c = doz(c, 4); } while (c != 0); // Move output pointers forward to the next two rows output0 = (float*) ((uintptr_t) output1 + output_height_stride); output1 = (float*) ((uintptr_t) output0 + output_height_stride); // Move input pointers forward to the next four rows i0 = i4; i1 = (const float*) ((uintptr_t) i0 + input_height_stride); i2 = (const float*) ((uintptr_t) i1 + input_height_stride); i3 = (const float*) ((uintptr_t) i2 + input_height_stride); i4 = (const float*) ((uintptr_t) i3 + input_height_stride); } } void xnn_f32_dwconv_minmax_ukernel_25p8c__neon_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if
XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const float*) ((uintptr_t) i9 + input_offset); } const float* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const float*) ((uintptr_t) i10 + input_offset); } const float* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const float*) ((uintptr_t) i11 + input_offset); } const float* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const float*) ((uintptr_t) i12 + input_offset); } const float* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const float*) ((uintptr_t) i13 + input_offset); } const float* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const float*) ((uintptr_t) i14 + input_offset); } const float* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const float*) ((uintptr_t) i15 + input_offset); } const float* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const float*) ((uintptr_t) i16 + input_offset); } const float* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const float*) ((uintptr_t) i17 + input_offset); } const float* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const float*) ((uintptr_t) i18 + input_offset); } const float* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const float*) ((uintptr_t) i19 + input_offset); } const float* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const float*) ((uintptr_t) i20 + input_offset); } const float* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const float*) ((uintptr_t) i21 + input_offset); } const float* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const float*) ((uintptr_t) i22 + input_offset); } const float* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const float*) ((uintptr_t) i23 + input_offset); } const float* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const float*) ((uintptr_t) i24 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567); const float32x4_t vi2x0123 
= vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vi9x4567 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w); w += 4; const float32x4_t vk9x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi9x4567, vk9x4567); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vi10x4567 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w); w += 4; const float32x4_t vk10x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi10x4567, vk10x4567); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vi11x4567 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w); w += 4; const float32x4_t vk11x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi11x4567, vk11x4567); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vi12x4567 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w); w += 4; const float32x4_t vk12x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi12x4567, vk12x4567); const float32x4_t 
vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vi13x4567 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w); w += 4; const float32x4_t vk13x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi13x4567, vk13x4567); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vi14x4567 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w); w += 4; const float32x4_t vk14x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi14x4567, vk14x4567); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vi15x4567 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w); w += 4; const float32x4_t vk15x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi15x4567, vk15x4567); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vi16x4567 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w); w += 4; const float32x4_t vk16x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi16x4567, vk16x4567); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vi17x4567 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w); w += 4; const float32x4_t vk17x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi17x4567, vk17x4567); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vi18x4567 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w); w += 4; const float32x4_t vk18x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi18x4567, vk18x4567); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vi19x4567 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w); w += 4; const float32x4_t vk19x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi19x4567, vk19x4567); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vi20x4567 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w); w += 4; const float32x4_t vk20x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi20x4567, vk20x4567); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vi21x4567 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w); w += 4; const float32x4_t vk21x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi21x4567, vk21x4567); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vi22x4567 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w); w += 4; const float32x4_t vk22x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi22x4567, vk22x4567); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vi23x4567 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w); w += 4; const float32x4_t vk23x4567 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, 
vi23x0123, vk23x0123); vacc4567p1 = vmlaq_f32(vacc4567p1, vi23x4567, vk23x4567); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vi24x4567 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w); w += 4; const float32x4_t vk24x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi24x4567, vk24x4567); // Add up all accumulators to vacc01234567p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); i9 += 4; const float32x4_t vk9x0123 = vld1q_f32(w + 76); vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); i10 += 4; const float32x4_t vk10x0123 = vld1q_f32(w + 84); vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); i11 += 4; const float32x4_t vk11x0123 = vld1q_f32(w + 92); vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); i12 += 4; const float32x4_t vk12x0123 = vld1q_f32(w + 100); vacc0123p0 = vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); i13 += 4; const float32x4_t vk13x0123 = vld1q_f32(w + 108); vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); i14 += 4; const float32x4_t vk14x0123 = vld1q_f32(w + 116); vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); i15 += 4; const float32x4_t vk15x0123 = vld1q_f32(w + 124); vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); i16 += 4; const float32x4_t vk16x0123 = vld1q_f32(w + 132); 
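// Weight addressing note for this 4-channel remainder loop: the weights were
// packed for the 8-channel tile (8 bias floats, then 8 floats per tap), so
// the first four channels of tap t sit at offset 8 + 8*t from the start of
// the weight block. With w already advanced 4 floats past the bias, tap t is
// loaded from w + 4 + 8*t (e.g. t = 16 gives the w + 132 just above).
// Equivalent plain-index form (illustrative only):
//
//   const float* vk_t = w + 4 + 8 * t;  /* channels 0..3 of tap t */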
vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); i17 += 4; const float32x4_t vk17x0123 = vld1q_f32(w + 140); vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); i18 += 4; const float32x4_t vk18x0123 = vld1q_f32(w + 148); vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); i19 += 4; const float32x4_t vk19x0123 = vld1q_f32(w + 156); vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); i20 += 4; const float32x4_t vk20x0123 = vld1q_f32(w + 164); vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); i21 += 4; const float32x4_t vk21x0123 = vld1q_f32(w + 172); vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); i22 += 4; const float32x4_t vk22x0123 = vld1q_f32(w + 180); vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); i23 += 4; const float32x4_t vk23x0123 = vld1q_f32(w + 188); vacc0123p1 = vmlaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); i24 += 4; const float32x4_t vk24x0123 = vld1q_f32(w + 196); vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); const float32x4_t vi9x0123 = vld1q_f32(i9); const float32x4_t vk9x0123 = vld1q_f32(w + 80); vacc0123p1 = vmlaq_f32(vacc0123p1, vi9x0123, vk9x0123); const float32x4_t vi10x0123 = vld1q_f32(i10); const float32x4_t vk10x0123 = vld1q_f32(w + 88); vacc0123p0 = vmlaq_f32(vacc0123p0, vi10x0123, vk10x0123); const float32x4_t vi11x0123 = vld1q_f32(i11); const float32x4_t vk11x0123 = vld1q_f32(w + 96); vacc0123p1 = vmlaq_f32(vacc0123p1, vi11x0123, vk11x0123); const float32x4_t vi12x0123 = vld1q_f32(i12); const float32x4_t vk12x0123 = vld1q_f32(w + 104); vacc0123p0 = 
vmlaq_f32(vacc0123p0, vi12x0123, vk12x0123); const float32x4_t vi13x0123 = vld1q_f32(i13); const float32x4_t vk13x0123 = vld1q_f32(w + 112); vacc0123p1 = vmlaq_f32(vacc0123p1, vi13x0123, vk13x0123); const float32x4_t vi14x0123 = vld1q_f32(i14); const float32x4_t vk14x0123 = vld1q_f32(w + 120); vacc0123p0 = vmlaq_f32(vacc0123p0, vi14x0123, vk14x0123); const float32x4_t vi15x0123 = vld1q_f32(i15); const float32x4_t vk15x0123 = vld1q_f32(w + 128); vacc0123p1 = vmlaq_f32(vacc0123p1, vi15x0123, vk15x0123); const float32x4_t vi16x0123 = vld1q_f32(i16); const float32x4_t vk16x0123 = vld1q_f32(w + 136); vacc0123p0 = vmlaq_f32(vacc0123p0, vi16x0123, vk16x0123); const float32x4_t vi17x0123 = vld1q_f32(i17); const float32x4_t vk17x0123 = vld1q_f32(w + 144); vacc0123p1 = vmlaq_f32(vacc0123p1, vi17x0123, vk17x0123); const float32x4_t vi18x0123 = vld1q_f32(i18); const float32x4_t vk18x0123 = vld1q_f32(w + 152); vacc0123p0 = vmlaq_f32(vacc0123p0, vi18x0123, vk18x0123); const float32x4_t vi19x0123 = vld1q_f32(i19); const float32x4_t vk19x0123 = vld1q_f32(w + 160); vacc0123p1 = vmlaq_f32(vacc0123p1, vi19x0123, vk19x0123); const float32x4_t vi20x0123 = vld1q_f32(i20); const float32x4_t vk20x0123 = vld1q_f32(w + 168); vacc0123p0 = vmlaq_f32(vacc0123p0, vi20x0123, vk20x0123); const float32x4_t vi21x0123 = vld1q_f32(i21); const float32x4_t vk21x0123 = vld1q_f32(w + 176); vacc0123p1 = vmlaq_f32(vacc0123p1, vi21x0123, vk21x0123); const float32x4_t vi22x0123 = vld1q_f32(i22); const float32x4_t vk22x0123 = vld1q_f32(w + 184); vacc0123p0 = vmlaq_f32(vacc0123p0, vi22x0123, vk22x0123); const float32x4_t vi23x0123 = vld1q_f32(i23); const float32x4_t vk23x0123 = vld1q_f32(w + 192); vacc0123p1 = vmlaq_f32(vacc0123p1, vi23x0123, vk23x0123); const float32x4_t vi24x0123 = vld1q_f32(i24); const float32x4_t vk24x0123 = vld1q_f32(w + 200); vacc0123p0 = vmlaq_f32(vacc0123p0, vi24x0123, vk24x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const
float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_4p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if
XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi3x4567, vk3x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = 
vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_8f8m9l4c4s4r__neon_acc2( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 8); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* w = weights; // First pass to process 8 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } input += 8; // Process c channels and write to buffer.
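// This multipass kernel (8 taps first, 8 per middle pass, up to 9 last, per
// its 8f8m9l name) spreads a large kernel_size over several sweeps through
// the `buffer` scratch area: the first pass stores bias plus taps 0..7, each
// middle pass reloads the partial sums and folds in 8 more taps, and the
// last pass adds the remaining taps, clamps, and writes the output. Roughly
// (illustrative pseudocode only):
//
//   buffer[c]  = bias[c] + sum(taps 0..7)           /* first pass      */
//   buffer[c] += sum(next 8 taps)                   /* middle pass(es) */
//   output[c]  = clamp(buffer[c] + sum(last taps))  /* last pass, <= 9 */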
size_t c = 0; for (; c < channels; c += 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Middle pass to process 8 inputs in each iteration. for (size_t ks = kernel_size - 8; ks > 9; ks -= 8) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } input += 8; size_t c = 0; for (; c < channels; c += 4) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = 
vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); vst1q_f32(b, vacc0123p0); b += 4; } } // Last pass to process up to 9 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(b); b += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; float32x4_t vk8x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, 
vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(b); const float32x4_t vi0x0123 = vld1q_f32(i0); float32x4_t vk0x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); float32x4_t vk1x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); float32x4_t vk2x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); float32x4_t vk3x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); float32x4_t vk4x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); float32x4_t vk5x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); float32x4_t vk6x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); float32x4_t vk7x0123 = vld1q_f32(w); w += 4; vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); float32x4_t vk8x0123 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); // Add up all accumulators to vacc0123p0 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_9p8c__neon( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if
XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; float32x4_t vacc4567p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w); w += 4; const float32x4_t vk0x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w); w += 4; const float32x4_t vk1x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w); w += 4; const float32x4_t vk2x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w); w += 4; const float32x4_t vk3x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi3x4567, vk3x4567); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w); w += 4; const float32x4_t vk4x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w); w += 4; const float32x4_t vk5x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi5x4567, vk5x4567); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w); w += 4; const float32x4_t vk6x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w); w += 4; const float32x4_t vk7x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi7x4567, vk7x4567); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; const float32x4_t vk8x4567 = vld1q_f32(w); w += 4; vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vacc4567 = vminq_f32(vacc4567, vmax); vst1q_f32(output, vacc0123); output += 4; 
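// The 8-channel tile is held as two q-registers (channels 0-3 and 4-7), so
// the clamped results go out as two 4-float stores. The clamp applied just
// above is the usual minmax idiom, per lane (illustrative scalar form):
//
//   out[c] = fminf(fmaxf(acc[c], params->scalar.min), params->scalar.max);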
vst1q_f32(output, vacc4567); output += 4; } for (; c >= 4; c -= 4) { float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vk0x0123 = vld1q_f32(w + 4); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vk1x0123 = vld1q_f32(w + 12); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vk2x0123 = vld1q_f32(w + 20); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vk3x0123 = vld1q_f32(w + 28); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4; const float32x4_t vk4x0123 = vld1q_f32(w + 36); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4; const float32x4_t vk5x0123 = vld1q_f32(w + 44); vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4; const float32x4_t vk6x0123 = vld1q_f32(w + 52); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4; const float32x4_t vk7x0123 = vld1q_f32(w + 60); vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4; const float32x4_t vk8x0123 = vld1q_f32(w + 68); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); vst1q_f32(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { float32x4_t vacc0123p0 = vld1q_f32(w); const float32x4_t vi0x0123 = vld1q_f32(i0); const float32x4_t vk0x0123 = vld1q_f32(w + 8); vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); const float32x4_t vi1x0123 = vld1q_f32(i1); const float32x4_t vk1x0123 = vld1q_f32(w + 16); vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); const float32x4_t vi2x0123 = vld1q_f32(i2); const float32x4_t vk2x0123 = vld1q_f32(w + 24); vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); const float32x4_t vi3x0123 = vld1q_f32(i3); const float32x4_t vk3x0123 = vld1q_f32(w + 32); vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); const float32x4_t vi4x0123 = vld1q_f32(i4); const float32x4_t vk4x0123 = vld1q_f32(w + 40); vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); const float32x4_t vi5x0123 = vld1q_f32(i5); const float32x4_t vk5x0123 = vld1q_f32(w + 48); vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); const float32x4_t vi6x0123 = vld1q_f32(i6); const float32x4_t vk6x0123 = vld1q_f32(w + 56); vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); const float32x4_t vi7x0123 = vld1q_f32(i7); const float32x4_t vk7x0123 = vld1q_f32(w + 64); vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); const float32x4_t vi8x0123 = vld1q_f32(i8); const float32x4_t vk8x0123 = vld1q_f32(w + 72); vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); vacc0123 = vminq_f32(vacc0123, vmax); float32x2_t vacc01 = vget_low_f32(vacc0123); if (c & 2) { vst1_f32(output, vacc01); output += 2; vacc01 = vget_high_f32(vacc0123); } if (c & 1) { vst1_lane_f32(output, vacc01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4( size_t input_height, size_t input_width, const float* input, const 
float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top == 1); const uint32x4_t vmask = vld1q_u32(params->neon_stride1.mask); const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride1.max); const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride1.min); const float32x4_t vw0123 = vld1q_f32(weights); const float32x4_t vw4567 = vld1q_f32(weights + 4); const float32x2_t vw89 = vld1_f32(weights + 8); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = input; const float* i2 = (const float*) ((uintptr_t) i1 + input_width); const float* i3 = (const float*) ((uintptr_t) i2 + input_width); float* o0 = output; float* o1 = (float*) ((uintptr_t) o0 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i2 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i3 = zero; } float32x4_t vi0x0123 = vmovq_n_f32(0.0f); float32x4_t vi1x0123 = vmovq_n_f32(0.0f); float32x4_t vi2x0123 = vmovq_n_f32(0.0f); float32x4_t vi3x0123 = vmovq_n_f32(0.0f); float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; size_t w = input_width; for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) { float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4; const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4; const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4; vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3); const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1); vi0x0123 = vi0x4567; vi1x0123 = vi1x4567; vi2x0123 = vi2x4567; vi3x0123 = vi3x4567; const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1); const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1); const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); vo1p0 =
vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); vi0x4567 = vi0x89AB; vi1x4567 = vi1x89AB; vi2x4567 = vi2x89AB; vi3x4567 = vi3x89AB; float32x4_t vo0 = vmaxq_f32(vo0p0, vmin); float32x4_t vo1 = vmaxq_f32(vo1p0, vmin); vo0 = vminq_f32(vo0, vmax); vo1 = vminq_f32(vo1, vmax); vst1q_f32(o1, vo1); o1 += 4; vst1q_f32(o0, vo0); o0 += 4; } // Always process the last block of 1..4 pixels. assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567))); vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567))); vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567))); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw4567), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3); const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_low_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_low_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_low_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_low_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_high_f32(vw4567), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_high_f32(vw4567), 1); const float32x4_t vzero = vmovq_n_f32(0.0f); const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1); const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1); const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_high_f32(vw0123), 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1); vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0); vo1p0 = vmlaq_lane_f32(vo1p0, vi2x5678, vget_high_f32(vw4567), 0); vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); float32x4_t vo0 = vmaxq_f32(vo0p0, vmin); float32x4_t vo1 = vmaxq_f32(vo1p0, vmin); vo0 = vminq_f32(vo0, vmax); vo1 = vminq_f32(vo1, vmax); if XNN_LIKELY(w == 4 * sizeof(float)) { vst1q_f32(o1, vo1); o1 += 4; vst1q_f32(o0, vo0); o0 += 4; } else { float32x2_t vo0_lo = vget_low_f32(vo0); float32x2_t vo1_lo = vget_low_f32(vo1); if (w & (2 * sizeof(float))) { vst1_f32(o1, vo1_lo); o1 += 2; vst1_f32(o0, vo0_lo); o0 += 2; vo0_lo = vget_high_f32(vo0); vo1_lo = vget_high_f32(vo1); } if (w & (1 * sizeof(float))) { vst1_lane_f32(o1, vo1_lo, 0); o1 += 1; vst1_lane_f32(o0, vo0_lo, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i2 - input_decrement); i1 = (const float*) ((uintptr_t) i3 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); o0 = o1; o1 = (float*) ((uintptr_t) o0 + input_width); output_height = doz(output_height, 2); } while (output_height != 0); } void 
void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 0);
  assert(padding_top <= 1);

  const uint32x4_t vmask_even = vld1q_u32(params->neon_stride2.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon_stride2.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride2.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride2.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x2_t vw89 = vld1_f32(weights + 8);

  const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));

  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
  if XNN_UNPREDICTABLE(padding_top != 0) {
    i0 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
      i2 = zero;
    }

    float32x4_t vi0x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi1x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi2x1357 = vmovq_n_f32(0.0f);

    size_t w = input_width;
    for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0); i0 += 8;
      const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1); i1 += 8;
      const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2); i2 += 8;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vw89, 0);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x8ACE9BDF.val[1], 3);
      vi0x1357 = vi0x8ACE9BDF.val[1];
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x8ACE9BDF.val[1], 3);
      vi1x1357 = vi1x8ACE9BDF.val[1];
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x8ACE9BDF.val[1], 3);
      vi2x1357 = vi2x8ACE9BDF.val[1];

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_high_f32(vw4567), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vw89, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Last block has 0-7 pixels to process.
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0);
      const float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1);
      const float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2);

      const float32x4_t vi0x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[0])));
      const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[1])));
      const float32x4_t vi1x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[0])));
      const float32x4_t vi1x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[1])));
      const float32x4_t vi2x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[0])));
      const float32x4_t vi2x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[1])));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE, vw89, 0);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x9BDF, 3);
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x9BDF, 3);
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x9BDF, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_high_f32(vw4567), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x9BDF, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x9BDF, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x9BDF, vw89, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      w += 1 * sizeof(float);
      if (w & (8 * sizeof(float))) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w & (4 * sizeof(float))) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w & (2 * sizeof(float))) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i0 + input_width);
    i2 = (const float*) ((uintptr_t) i1 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}
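// The stride-2 kernel above leans on vld2q_f32 to load eight consecutive
// pixels and split them into an even-index vector and an odd-index vector in
// a single instruction. A standalone illustration of that deinterleaving
// (buffer names are hypothetical, not part of the kernel):
static void example_vld2q_deinterleave(const float* x /* 8 floats */, float* even, float* odd)
{
  const float32x4x2_t v = vld2q_f32(x);
  vst1q_f32(even, v.val[0]);  // {x[0], x[2], x[4], x[6]}
  vst1q_f32(odd, v.val[1]);   // {x[1], x[3], x[5], x[7]}
}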
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const uint32x4_t vmask = vld1q_u32(params->neon_stride1.mask);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride1.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride1.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
    }

    float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0123 = vmovq_n_f32(0.0f);

    float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
    float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
    float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
    float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
    float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;

    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
      float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
      float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
      float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
      float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;

      vi0x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x89AB)));
      vi1x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x89AB)));
      vi2x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x89AB)));
      vi3x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x89AB)));
      vi4x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x89AB)));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      vi0x0123 = vi0x4567;
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      vi1x0123 = vi1x4567;
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      vi2x0123 = vi2x4567;
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      vi3x0123 = vi3x4567;
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
      vi4x0123 = vi4x4567;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
      vi0x4567 = vi0x89AB;
      const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
      vi1x4567 = vi1x89AB;
      const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
      vi2x4567 = vi2x89AB;
      const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
      vi3x4567 = vi3x89AB;
      const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
      vi4x4567 = vi4x89AB;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;

      w -= 4 * sizeof(float);
    }

    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
    {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
      vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
      vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
      vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
      vi4x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x4567)));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);

      const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
      const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
      const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
      const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
      const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
      const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
      const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
      const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
      const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x2345, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x2345, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);

      const float32x4_t vzero = vmovq_n_f32(0.0f);
      const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
      const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
      const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
      const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x5678, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0);

      const float32x4_t vi0x6789 = vextq_f32(vi0x5678, vzero, 1);
      const float32x4_t vi1x6789 = vextq_f32(vi1x5678, vzero, 1);
      const float32x4_t vi2x6789 = vextq_f32(vi2x5678, vzero, 1);
      const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1);
      const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x6789, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      if XNN_LIKELY(w & (4 * sizeof(float))) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w & (2 * sizeof(float))) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w & (1 * sizeof(float))) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i1 - input_decrement);
    i1 = (const float*) ((uintptr_t) i2 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
  } while (--output_height != 0);
}
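// For the 5x5 kernels the packed weights appear to be one bias value followed
// by the 25 taps in row-major order: weights[0] = bias and
// weights[1 + ky * 5 + kx] = k(ky, kx). This is inferred from the lane usage
// above (e.g. the center tap vi2x4567 * vwCDEF lane 1 = weights[13] = 1 + 2*5 + 2)
// and is stated as an assumption, not as the documented packing contract.
// Under that layout, one output pixel reduces to:
static float example_dwconv5x5_pixel(const float* rows[5], size_t ox, const float* weights)
{
  float acc = weights[0];  // bias
  for (size_t ky = 0; ky < 5; ky++) {
    for (size_t kx = 0; kx < 5; kx++) {
      acc += rows[ky][ox + kx] * weights[1 + ky * 5 + kx];
    }
  }
  return acc;
}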
void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4(
    size_t input_height, size_t input_width, const float* input,
    const float* weights, const float* zero, float* output, uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

  const uint32x4_t vmask_even = vld1q_u32(params->neon_stride2.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon_stride2.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon_stride2.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon_stride2.min);

  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i4 = zero;
    }

    float32x4_t vi0x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0246 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0246 = vmovq_n_f32(0.0f);

    float32x4_t vi0x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi1x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi2x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi3x1357 = vmovq_n_f32(0.0f);
    float32x4_t vi4x1357 = vmovq_n_f32(0.0f);

    float32x4x2_t vi0x8ACE9BDF = vld2q_f32(i0); i0 += 8;
    float32x4x2_t vi1x8ACE9BDF = vld2q_f32(i1); i1 += 8;
    float32x4x2_t vi2x8ACE9BDF = vld2q_f32(i2); i2 += 8;
    float32x4x2_t vi3x8ACE9BDF = vld2q_f32(i3); i3 += 8;
    float32x4x2_t vi4x8ACE9BDF = vld2q_f32(i4); i4 += 8;

    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[1], vwOP, 0);

      const float32x4_t vi0x68AC = vextq_f32(vi0x0246, vi0x8ACE9BDF.val[0], 3);
      vi0x0246 = vi0x8ACE9BDF.val[0];
      const float32x4_t vi1x68AC = vextq_f32(vi1x0246, vi1x8ACE9BDF.val[0], 3);
      vi1x0246 = vi1x8ACE9BDF.val[0];
      const float32x4_t vi2x68AC = vextq_f32(vi2x0246, vi2x8ACE9BDF.val[0], 3);
      vi2x0246 = vi2x8ACE9BDF.val[0];
      const float32x4_t vi3x68AC = vextq_f32(vi3x0246, vi3x8ACE9BDF.val[0], 3);
      vi3x0246 = vi3x8ACE9BDF.val[0];
      const float32x4_t vi4x68AC = vextq_f32(vi4x0246, vi4x8ACE9BDF.val[0], 3);
      vi4x0246 = vi4x8ACE9BDF.val[0];

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x68AC, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x68AC, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x68AC, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x68AC, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x68AC, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x8ACE9BDF.val[1], 3);
      vi0x1357 = vi0x8ACE9BDF.val[1];
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x8ACE9BDF.val[1], 3);
      vi1x1357 = vi1x8ACE9BDF.val[1];
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x8ACE9BDF.val[1], 3);
      vi2x1357 = vi2x8ACE9BDF.val[1];
      const float32x4_t vi3x79BD = vextq_f32(vi3x1357, vi3x8ACE9BDF.val[1], 3);
      vi3x1357 = vi3x8ACE9BDF.val[1];
      const float32x4_t vi4x79BD = vextq_f32(vi4x1357, vi4x8ACE9BDF.val[1], 3);
      vi4x1357 = vi4x8ACE9BDF.val[1];

      const float32x4x2_t vi0xGIKMHJLN = vld2q_f32(i0); i0 += 8;
      const float32x4x2_t vi1xGIKMHJLN = vld2q_f32(i1); i1 += 8;
      const float32x4x2_t vi2xGIKMHJLN = vld2q_f32(i2); i2 += 8;
      const float32x4x2_t vi3xGIKMHJLN = vld2q_f32(i3); i3 += 8;
      const float32x4x2_t vi4xGIKMHJLN = vld2q_f32(i4); i4 += 8;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x79BD, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x79BD, vget_high_f32(vwKLMN), 0);

      const float32x4_t vi0xACEG = vextq_f32(vi0x8ACE9BDF.val[0], vi0xGIKMHJLN.val[0], 1);
      vi0x8ACE9BDF = vi0xGIKMHJLN;
      const float32x4_t vi1xACEG = vextq_f32(vi1x8ACE9BDF.val[0], vi1xGIKMHJLN.val[0], 1);
      vi1x8ACE9BDF = vi1xGIKMHJLN;
      const float32x4_t vi2xACEG = vextq_f32(vi2x8ACE9BDF.val[0], vi2xGIKMHJLN.val[0], 1);
      vi2x8ACE9BDF = vi2xGIKMHJLN;
      const float32x4_t vi3xACEG = vextq_f32(vi3x8ACE9BDF.val[0], vi3xGIKMHJLN.val[0], 1);
      vi3x8ACE9BDF = vi3xGIKMHJLN;
      const float32x4_t vi4xACEG = vextq_f32(vi4x8ACE9BDF.val[0], vi4xGIKMHJLN.val[0], 1);
      vi4x8ACE9BDF = vi4xGIKMHJLN;

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0xACEG, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1xACEG, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2xACEG, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3xACEG, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4xACEG, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      vst1q_f32(o0, vo0); o0 += 4;
    }

    // Last block has 1-8 pixels to process.
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
      float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);

      const float32x4_t vi0x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[0])));
      const float32x4_t vi1x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[0])));
      const float32x4_t vi2x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[0])));
      const float32x4_t vi3x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi3x8ACE9BDF.val[0])));
      const float32x4_t vi4x8ACE = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi4x8ACE9BDF.val[0])));

      const float32x4_t vi0x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x8ACE9BDF.val[1])));
      const float32x4_t vi1x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x8ACE9BDF.val[1])));
      const float32x4_t vi2x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x8ACE9BDF.val[1])));
      const float32x4_t vi3x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi3x8ACE9BDF.val[1])));
      const float32x4_t vi4x9BDF = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi4x8ACE9BDF.val[1])));

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE, vget_high_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE, vget_low_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE, vget_low_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE, vget_high_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE, vget_high_f32(vwKLMN), 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x9BDF, vget_low_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x9BDF, vget_low_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x9BDF, vget_high_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x9BDF, vget_high_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x9BDF, vwOP, 0);

      const float32x4_t vi0x68AC = vextq_f32(vi0x0246, vi0x8ACE, 3);
      const float32x4_t vi1x68AC = vextq_f32(vi1x0246, vi1x8ACE, 3);
      const float32x4_t vi2x68AC = vextq_f32(vi2x0246, vi2x8ACE, 3);
      const float32x4_t vi3x68AC = vextq_f32(vi3x0246, vi3x8ACE, 3);
      const float32x4_t vi4x68AC = vextq_f32(vi4x0246, vi4x8ACE, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x68AC, vget_low_f32(vw0123), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x68AC, vget_high_f32(vw4567), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x68AC, vget_high_f32(vw89AB), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x68AC, vget_low_f32(vwGHIJ), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x68AC, vget_low_f32(vwKLMN), 1);

      const float32x4_t vi0x79BD = vextq_f32(vi0x1357, vi0x9BDF, 3);
      const float32x4_t vi1x79BD = vextq_f32(vi1x1357, vi1x9BDF, 3);
      const float32x4_t vi2x79BD = vextq_f32(vi2x1357, vi2x9BDF, 3);
      const float32x4_t vi3x79BD = vextq_f32(vi3x1357, vi3x9BDF, 3);
      const float32x4_t vi4x79BD = vextq_f32(vi4x1357, vi4x9BDF, 3);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0x79BD, vget_high_f32(vw0123), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1x79BD, vget_high_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2x79BD, vget_low_f32(vwCDEF), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3x79BD, vget_low_f32(vwGHIJ), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4x79BD, vget_high_f32(vwKLMN), 0);

      const float32x4_t vzero = vmovq_n_f32(0.0f);
      const float32x4_t vi0xACEG = vextq_f32(vi0x8ACE, vzero, 1);
      const float32x4_t vi1xACEG = vextq_f32(vi1x8ACE, vzero, 1);
      const float32x4_t vi2xACEG = vextq_f32(vi2x8ACE, vzero, 1);
      const float32x4_t vi3xACEG = vextq_f32(vi3x8ACE, vzero, 1);
      const float32x4_t vi4xACEG = vextq_f32(vi4x8ACE, vzero, 1);

      vo0p0 = vmlaq_lane_f32(vo0p0, vi0xACEG, vget_low_f32(vw4567), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi1xACEG, vget_high_f32(vw89AB), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi2xACEG, vget_high_f32(vwCDEF), 1);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi3xACEG, vget_low_f32(vwKLMN), 0);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi4xACEG, vwOP, 1);

      float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
      vo0 = vminq_f32(vo0, vmax);

      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        vst1q_f32(o0, vo0); o0 += 4;
      } else {
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (w_tmp & 2) {
          vst1_f32(o0, vo0_lo); o0 += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (w_tmp & 1) {
          vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i3 - input_decrement);
    i2 = (const float*) ((uintptr_t) i4 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}
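// Note on the stride-2 tails: w counts the remaining *input* bytes, while
// stores happen per *output* pixel. With 2x subsampling the number of output
// pixels left is ceil(w_in_floats / 2), which is what the
// (w + 1 * sizeof(float)) / (2 * sizeof(float)) expression in the 5x5s2p2 tail
// above computes; e.g. 7 remaining input floats yield (7 + 1) / 2 = 4 output
// pixels, exactly one full float32x4_t store.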
void xnn_f32_f16_vcvt_ukernel__neon_x8(
    size_t batch, const float* input, void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const uint32x4_t vexp_bias = vld1q_dup_u32(&params->neon.exp_bias);
  const float32x4_t vscale_to_inf = vld1q_dup_f32(&params->neon.scale_to_inf);
  const uint32x4_t vexpw_max = vld1q_dup_u32(&params->neon.expw_max);
  const float32x4_t vscale_to_zero = vld1q_dup_f32(&params->neon.scale_to_zero);
  const uint32x4_t vbias_min = vdupq_n_u32(UINT32_C(0x40000000));
  const uint16x8_t vexph_mask = vdupq_n_u16(UINT16_C(0x7C00));
  const uint16x8_t vmanth_mask = vdupq_n_u16(UINT16_C(0x0FFF));
  const uint16x8_t vsignh_mask = vdupq_n_u16(UINT16_C(0x8000));
  const uint16x8_t vnanh = vdupq_n_u16(UINT16_C(0x7E00));

  uint16_t* o = (uint16_t*) output;
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const float32x4_t vx0 = vld1q_f32(input); input += 4;
    const float32x4_t vx1 = vld1q_f32(input); input += 4;

    const float32x4_t vabsx0 = vabsq_f32(vx0);
    const float32x4_t vabsx1 = vabsq_f32(vx1);

    uint32x4_t vbias0 = vaddq_u32(vreinterpretq_u32_f32(vabsx0), vexp_bias);
    uint32x4_t vbias1 = vaddq_u32(vreinterpretq_u32_f32(vabsx1), vexp_bias);

    float32x4_t vf0 = vmulq_f32(vabsx0, vscale_to_inf);
    float32x4_t vf1 = vmulq_f32(vabsx1, vscale_to_inf);
    const uint32x4_t vnanmaskw0 = vcgtq_u32(vreinterpretq_u32_f32(vabsx0), vexpw_max);
    const uint32x4_t vnanmaskw1 = vcgtq_u32(vreinterpretq_u32_f32(vabsx1), vexpw_max);

    vbias0 = vandq_u32(vbias0, vexpw_max);
    vbias1 = vandq_u32(vbias1, vexpw_max);
    vf0 = vmulq_f32(vf0, vscale_to_zero);
    vf1 = vmulq_f32(vf1, vscale_to_zero);
    const uint16x8_t vnanmaskh0 = vcombine_u16(vmovn_u32(vnanmaskw0), vmovn_u32(vnanmaskw1));

    vbias0 = vmaxq_u32(vbias0, vbias_min);
    vbias1 = vmaxq_u32(vbias1, vbias_min);

    vf0 = vaddq_f32(vf0, vreinterpretq_f32_u32(vbias0));
    vf1 = vaddq_f32(vf1, vreinterpretq_f32_u32(vbias1));

    uint16x8_t vexph0 = vcombine_u16(vshrn_n_u32(vreinterpretq_u32_f32(vf0), 13), vshrn_n_u32(vreinterpretq_u32_f32(vf1), 13));
    uint16x8_t vmanth0 = vcombine_u16(vmovn_u32(vreinterpretq_u32_f32(vf0)), vmovn_u32(vreinterpretq_u32_f32(vf1)));
    uint16x8_t vsignh0 = vcombine_u16(vshrn_n_u32(vreinterpretq_u32_f32(vx0), 16), vshrn_n_u32(vreinterpretq_u32_f32(vx1), 16));

    vexph0 = vandq_u16(vexph0, vexph_mask);
    vmanth0 = vandq_u16(vmanth0, vmanth_mask);
    vsignh0 = vandq_u16(vsignh0, vsignh_mask);

    uint16x8_t vh0 = vaddq_u16(vmanth0, vexph0);

    vh0 = vbslq_u16(vnanmaskh0, vnanh, vh0);

    vh0 = vorrq_u16(vh0, vsignh0);

    vst1q_u16(o, vh0); o += 8;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const float32x4_t vx = vld1q_f32(input); input += 4;

    const float32x4_t vabsx = vabsq_f32(vx);

    uint32x4_t vbias = vaddq_u32(vreinterpretq_u32_f32(vabsx), vexp_bias);

    float32x4_t vf = vmulq_f32(vabsx, vscale_to_inf);
    const uint32x4_t vnanmaskw = vcgtq_u32(vreinterpretq_u32_f32(vabsx), vexpw_max);

    vbias = vandq_u32(vbias, vexpw_max);
    vf = vmulq_f32(vf, vscale_to_zero);
    const uint16x4_t vnanmaskh = vmovn_u32(vnanmaskw);

    vbias = vmaxq_u32(vbias, vbias_min);

    vf = vaddq_f32(vf, vreinterpretq_f32_u32(vbias));

    uint16x4_t vexph = vshrn_n_u32(vreinterpretq_u32_f32(vf), 13);
    uint16x4_t vmanth = vmovn_u32(vreinterpretq_u32_f32(vf));
    uint16x4_t vsignh = vshrn_n_u32(vreinterpretq_u32_f32(vx), 16);

    vexph = vand_u16(vexph, vget_low_u16(vexph_mask));
    vmanth = vand_u16(vmanth, vget_low_u16(vmanth_mask));
    vsignh = vand_u16(vsignh, vget_low_u16(vsignh_mask));

    uint16x4_t vh = vadd_u16(vmanth, vexph);

    vh = vbsl_u16(vnanmaskh, vget_low_u16(vnanh), vh);

    vh = vorr_u16(vh, vsignh);

    vst1_u16(o, vh); o += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch % sizeof(float) == 0);
    assert(batch >= 1 * sizeof(float));
    assert(batch <= 3 * sizeof(float));

    const float32x4_t vx = vld1q_f32(input);

    const float32x4_t vabsx = vabsq_f32(vx);

    uint32x4_t vbias = vaddq_u32(vreinterpretq_u32_f32(vabsx), vexp_bias);

    float32x4_t vf = vmulq_f32(vabsx, vscale_to_inf);
    const uint32x4_t vnanmaskw = vcgtq_u32(vreinterpretq_u32_f32(vabsx), vexpw_max);

    vbias = vandq_u32(vbias, vexpw_max);
    vf = vmulq_f32(vf, vscale_to_zero);
    const uint16x4_t vnanmaskh = vmovn_u32(vnanmaskw);

    vbias = vmaxq_u32(vbias, vbias_min);

    vf = vaddq_f32(vf, vreinterpretq_f32_u32(vbias));

    uint16x4_t vexph = vshrn_n_u32(vreinterpretq_u32_f32(vf), 13);
    uint16x4_t vmanth = vmovn_u32(vreinterpretq_u32_f32(vf));
    uint16x4_t vsignh = vshrn_n_u32(vreinterpretq_u32_f32(vx), 16);

    vexph = vand_u16(vexph, vget_low_u16(vexph_mask));
    vmanth = vand_u16(vmanth, vget_low_u16(vmanth_mask));
    vsignh = vand_u16(vsignh, vget_low_u16(vsignh_mask));

    uint16x4_t vh = vadd_u16(vmanth, vexph);

    vh = vbsl_u16(vnanmaskh, vget_low_u16(vnanh), vh);

    vh = vorr_u16(vh, vsignh);

    if (batch & (2 * sizeof(float))) {
      vst1_lane_u32((void*) o, vreinterpret_u32_u16(vh), 0); o += 2;
      vh = vext_u16(vh, vh, 2);
    }
    if (batch & (1 * sizeof(float))) {
      vst1_lane_u16(o, vh, 0);
    }
  }
}
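// The float->half conversion above follows a well-known bit-manipulation
// scheme (it resembles the approach popularized by the fp16 library): |x| is
// scaled by scale_to_inf and then scale_to_zero so that overflow saturates to
// infinity and subnormal halves round correctly in float arithmetic; a rebased
// exponent is formed from (bits(|x|) + exp_bias) masked to the exponent field
// and clamped below by 0x40000000. The half is then assembled from the shifted
// exponent bits (mask 0x7C00), the low mantissa bits (mask 0x0FFF), the
// original sign (mask 0x8000), and a canonical NaN (0x7E00) wherever |x|
// compares above expw_max.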
void xnn_f32_gavgpool_cw_ukernel__neon_x4(
    size_t elements, size_t channels, const float* input, float* output,
    const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(elements != 0);
  assert(elements % sizeof(float) == 0);
  assert(channels != 0);

  const uint32x4_t vmask = vld1q_u32(params->neon.mask);
  const float32x2_t vmultiplier = vld1_dup_f32(&params->neon.multiplier);
  const float32x2_t voutput_min = vld1_dup_f32(&params->neon.output_min);
  const float32x2_t voutput_max = vld1_dup_f32(&params->neon.output_max);

  do {
    float32x4_t vsum0 = vmovq_n_f32(0.0f);
    size_t n = elements;
    if (n >= 16 * sizeof(float)) {
      float32x4_t vsum1 = vmovq_n_f32(0.0f);
      do {
        const float32x4_t vi0 = vld1q_f32(input);
        const float32x4_t vi1 = vld1q_f32(input + 4);
        const float32x4_t vi2 = vld1q_f32(input + 8);
        const float32x4_t vi3 = vld1q_f32(input + 12);
        input += 16;
        const float32x4_t acc0 = vaddq_f32(vi0, vi1);
        const float32x4_t acc1 = vaddq_f32(vi2, vi3);
        vsum0 = vaddq_f32(vsum0, acc0);
        vsum1 = vaddq_f32(vsum1, acc1);
        n -= 16 * sizeof(float);
      } while (n >= 32 * sizeof(float));
      vsum0 = vaddq_f32(vsum0, vsum1);
    }

    while (n >= 4 * sizeof(float)) {
      const float32x4_t vi0 = vld1q_f32(input); input += 4;
      vsum0 = vaddq_f32(vsum0, vi0);
      n -= 4 * sizeof(float);
    }

    if XNN_UNLIKELY(n != 0) {
      float32x4_t vi0 = vld1q_f32(input);
      input = (const float*) ((uintptr_t) input + n);
      vi0 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0)));
      vsum0 = vaddq_f32(vsum0, vi0);
    }

    const float32x2_t vout2 = vpadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0));
    const float32x2_t vout1 = vpadd_f32(vout2, vout2);

    float32x2_t vout = vmul_f32(vout1, vmultiplier);
    vout = vmax_f32(vout, voutput_min);
    vout = vmin_f32(vout, voutput_max);

    vst1_lane_f32(output, vout, 0); output += 1;
  } while (--channels != 0);
}

void xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* buffer, float* output,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const float* i0 = input;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
  const size_t packed_channels = round_up_po2(channels, 4);
  const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float);

  float* b = buffer;
  for (size_t c = 0; c < channels; c += 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    vst1q_f32(b, vsum); b += 4;
  }
  for (rows -= 7; rows > 7; rows -= 7) {
    b = buffer;

    i0 = (const float*) ((uintptr_t) i0 + input_increment);
    i1 = (const float*) ((uintptr_t) i1 + input_increment);
    i2 = (const float*) ((uintptr_t) i2 + input_increment);
    i3 = (const float*) ((uintptr_t) i3 + input_increment);
    i4 = (const float*) ((uintptr_t) i4 + input_increment);
    i5 = (const float*) ((uintptr_t) i5 + input_increment);
    i6 = (const float*) ((uintptr_t) i6 + input_increment);

    for (size_t c = 0; c < channels; c += 4) {
      const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
      const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
      const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
      const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
      const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
      const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
      const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
      const float32x4_t vacc = vld1q_f32(b);

      const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
      const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
      const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
      const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

      const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
      const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

      const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

      vst1q_f32(b, vsum); b += 4;
    }
  }

  i0 = (const float*) ((uintptr_t) i0 + input_increment);
  i1 = (const float*) ((uintptr_t) i1 + input_increment);
  if (rows < 2) {
    i1 = zero;
  }
  i2 = (const float*) ((uintptr_t) i2 + input_increment);
  if (rows <= 2) {
    i2 = zero;
  }
  i3 = (const float*) ((uintptr_t) i3 + input_increment);
  if (rows < 4) {
    i3 = zero;
  }
  i4 = (const float*) ((uintptr_t) i4 + input_increment);
  if (rows <= 4) {
    i4 = zero;
  }
  i5 = (const float*) ((uintptr_t) i5 + input_increment);
  if (rows < 6) {
    i5 = zero;
  }
  i6 = (const float*) ((uintptr_t) i6 + input_increment);
  if (rows <= 6) {
    i6 = zero;
  }
  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  b = buffer;
  while (channels >= 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
    const float32x4_t vacc = vld1q_f32(b); b += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    vst1q_f32(output, vout); output += 4;

    channels -= 4;
  }
  if (channels != 0) {
    const float32x4_t vi0 = vld1q_f32(i0);
    const float32x4_t vi1 = vld1q_f32(i1);
    const float32x4_t vi2 = vld1q_f32(i2);
    const float32x4_t vi3 = vld1q_f32(i3);
    const float32x4_t vi4 = vld1q_f32(i4);
    const float32x4_t vi5 = vld1q_f32(i5);
    const float32x4_t vi6 = vld1q_f32(i6);
    const float32x4_t vacc = vld1q_f32(b);

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);

    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);

    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    float32x2_t vout_lo = vget_low_f32(vout);
    if (channels & 2) {
      vst1_f32(output, vout_lo); output += 2;
      vout_lo = vget_high_f32(vout);
    }
    if (channels & 1) {
      vst1_lane_f32(output, vout_lo, 0);
    }
  }
}

void xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* output,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows != 0);
  assert(rows <= 7);
  assert(channels != 0);

  const float* i0 = input;
  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  if (rows < 2) {
    i1 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
  if (rows <= 2) {
    i2 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
  if (rows < 4) {
    i3 = zero;
  }
  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
  if (rows <= 4) {
    i4 = zero;
  }
  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
  if (rows < 6) {
    i5 = zero;
  }
  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
  if (rows <= 6) {
    i6 = zero;
  }
  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  while (channels >= 4) {
    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    vst1q_f32(output, vout); output += 4;

    channels -= 4;
  }
  if (channels != 0) {
    const float32x4_t vi0 = vld1q_f32(i0);
    const float32x4_t vi1 = vld1q_f32(i1);
    const float32x4_t vi2 = vld1q_f32(i2);
    const float32x4_t vi3 = vld1q_f32(i3);
    const float32x4_t vi4 = vld1q_f32(i4);
    const float32x4_t vi5 = vld1q_f32(i5);
    const float32x4_t vi6 = vld1q_f32(i6);

    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);

    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);

    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);

    float32x4_t vout = vmulq_f32(vsum, vscale);
    vout = vmaxq_f32(vout, vmin);
    vout = vminq_f32(vout, vmax);

    float32x2_t vout_lo = vget_low_f32(vout);
    if (channels & 2) {
      vst1_f32(output, vout_lo); output += 2;
      vout_lo = vget_high_f32(vout);
    }
    if (channels & 1) {
      vst1_lane_f32(output, vout_lo, 0);
    }
  }
}
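// xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4 (above) averages up to 7 rows in
// a single pass, while the 7p7x variant accumulates groups of 7 rows into
// `buffer` (sized for channels rounded up to a multiple of 4) before the same
// scale/clamp epilogue; together they cover any row count. A hedged sketch of
// how a caller might dispatch between them (the real operator-level dispatch
// lives elsewhere in XNNPACK; this helper is illustrative only):
static void example_gavgpool_dispatch(
    size_t rows, size_t channels, const float* input, size_t input_stride,
    const float* zero, float* buffer, float* output,
    const union xnn_f32_scaleminmax_params* params)
{
  if (rows <= 7) {
    xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4(rows, channels, input, input_stride, zero, output, params);
  } else {
    xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4(rows, channels, input, input_stride, zero, buffer, output, params);
  }
}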
void xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;

  do {
    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;

    size_t k = kc;
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
      const float32x2_t va0 = vld1_f32(a0); a0 += 2;

      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
    }
    if XNN_UNLIKELY(k != 0) {
      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;

      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
      const float32x4_t vb4567 = vld1q_f32(w); w += 4;

      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
    }
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
    vacc0x4567 = vminq_f32(vacc0x4567, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);

    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(c0, vacc0x0123);
      vst1q_f32(c0 + 4, vacc0x4567);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      a0 = (const float*) ((uintptr_t) a0 - kc);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1q_f32(c0, vacc0x0123); c0 += 4;
        vacc0x0123 = vacc0x4567;
      }
      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
      if (nc & 2) {
        vst1_f32(c0, vacc0x01); c0 += 2;
        vacc0x01 = vget_high_f32(vacc0x0123);
      }
      if (nc & 1) {
        vst1_lane_f32(c0, vacc0x01, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  do {
    float32x2_t vacc0x01 = vld1_f32(w); w += 2;
    float32x2_t vacc1x01 = vacc0x01;
    float32x2_t vacc2x01 = vacc0x01;
    float32x2_t vacc3x01 = vacc0x01;

    size_t k = kc;
    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
      const float32x2_t va3 = vld1_f32(a3); a3 += 2;

      const float32x4_t vb01c01 = vld1q_f32(w); w += 4;

      const float32x2_t vb01c0 = vget_low_f32(vb01c01);
      const float32x2_t vb01c1 = vget_high_f32(vb01c01);

      vacc0x01 = vmla_lane_f32(vacc0x01, vb01c0, va0, 0);
      vacc1x01 = vmla_lane_f32(vacc1x01, vb01c0, va1, 0);
      vacc2x01 = vmla_lane_f32(vacc2x01, vb01c0, va2, 0);
      vacc3x01 = vmla_lane_f32(vacc3x01, vb01c0, va3, 0);
      vacc0x01 = vmla_lane_f32(vacc0x01, vb01c1, va0, 1);
      vacc1x01 = vmla_lane_f32(vacc1x01, vb01c1, va1, 1);
      vacc2x01 = vmla_lane_f32(vacc2x01, vb01c1, va2, 1);
      vacc3x01 = vmla_lane_f32(vacc3x01, vb01c1, va3, 1);
    }
    if XNN_UNLIKELY(k != 0) {
      const float32x2_t va0 = vld1_dup_f32(a0); a0 += 1;
      const float32x2_t va1 = vld1_dup_f32(a1); a1 += 1;
      const float32x2_t va2 = vld1_dup_f32(a2); a2 += 1;
      const float32x2_t va3 = vld1_dup_f32(a3); a3 += 1;

      const float32x2_t vb01 = vld1_f32(w); w += 2;

      vacc0x01 = vmla_f32(vacc0x01, va0, vb01);
      vacc1x01 = vmla_f32(vacc1x01, va1, vb01);
      vacc2x01 = vmla_f32(vacc2x01, va2, vb01);
      vacc3x01 = vmla_f32(vacc3x01, va3, vb01);
    }

    const float32x2_t vmax = vld1_dup_f32(&params->scalar.max);
    vacc0x01 = vmin_f32(vacc0x01, vmax);
    vacc1x01 = vmin_f32(vacc1x01, vmax);
    vacc2x01 = vmin_f32(vacc2x01, vmax);
    vacc3x01 = vmin_f32(vacc3x01, vmax);

    const float32x2_t vmin = vld1_dup_f32(&params->scalar.min);
    vacc0x01 = vmax_f32(vacc0x01, vmin);
    vacc1x01 = vmax_f32(vacc1x01, vmin);
    vacc2x01 = vmax_f32(vacc2x01, vmin);
    vacc3x01 = vmax_f32(vacc3x01, vmin);

    if XNN_LIKELY(nc >= 2) {
      vst1_f32(c0, vacc0x01);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);
      vst1_f32(c1, vacc1x01);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      vst1_f32(c2, vacc2x01);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      vst1_f32(c3, vacc3x01);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);

      a0 = (const float*) ((uintptr_t) a0 - kc);
      a1 = (const float*) ((uintptr_t) a1 - kc);
      a2 = (const float*) ((uintptr_t) a2 - kc);
      a3 = (const float*) ((uintptr_t) a3 - kc);

      nc -= 2;
    } else {
      assert(nc == 1);
      vst1_lane_f32(c0, vacc0x01, 0);
      vst1_lane_f32(c1, vacc1x01, 0);
      vst1_lane_f32(c2, vacc2x01, 0);
      vst1_lane_f32(c3, vacc3x01, 0);

      nc = 0;
    }
  } while (nc != 0);
}
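// Each GEMM microkernel in this file reads `w` as packed weights: `nr` bias
// values followed by kc/sizeof(float) groups of `nr` column values (nr = 8 or
// 2 above). A scalar model of the tile one microkernel computes, under that
// assumed layout (illustrative only, strides in floats rather than bytes):
static void example_gemm_tile_reference(
    size_t mr, size_t nr, size_t kc_floats,
    const float* a, size_t a_stride_floats, const float* w,
    float* c, size_t cm_stride_floats, float min, float max)
{
  for (size_t m = 0; m < mr; m++) {
    for (size_t n = 0; n < nr; n++) {
      float acc = w[n];  // bias
      for (size_t k = 0; k < kc_floats; k++) {
        acc += a[m * a_stride_floats + k] * w[nr + k * nr + n];
      }
      acc = acc > max ? max : acc;  // like vminq_f32(acc, vmax)
      acc = acc < min ? min : acc;  // like vmaxq_f32(acc, vmin)
      c[m * cm_stride_floats + n] = acc;
    }
  }
}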
void xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128(
    size_t mr, size_t nc, size_t kc,
    const float* restrict a, size_t a_stride, const float* restrict w,
    float* restrict c, size_t cm_stride, size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const float* a0 = a;
  float* c0 = c;
  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  do {
    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
    float32x4_t vacc1x0123 = vacc0x0123;
    float32x4_t vacc1x4567 = vacc0x4567;
    float32x4_t vacc2x0123 = vacc0x0123;
    float32x4_t vacc2x4567 = vacc0x4567;
    float32x4_t vacc3x0123 = vacc0x0123;
    float32x4_t vacc3x4567 = vacc0x4567;

    size_t k = kc;
    if XNN_LIKELY(k >= 4 * sizeof(float)) {
      do {
        const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
        const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
        const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
        const float32x4_t va3 = vld1q_f32(a3); a3 += 4;

        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);

        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);

        const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);

        const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);

        k -= 4 * sizeof(float);
      } while (k >= 4 * sizeof(float));
    }
    if XNN_UNLIKELY(k != 0) {
      if XNN_UNLIKELY(k & (2 * sizeof(float))) {
        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
        const float32x2_t va3 = vld1_f32(a3); a3 += 2;

        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);

        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
      }
      if XNN_UNLIKELY(k & (1 * sizeof(float))) {
        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;

        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
        const float32x4_t vb4567 = vld1q_f32(w); w += 4;

        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
      }
    }
    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
    vacc3x4567 = vminq_f32(vacc3x4567, vmax);

    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);

    if XNN_LIKELY(nc >= 8) {
      vst1q_f32(c3, vacc3x0123);
      vst1q_f32(c3 + 4, vacc3x4567);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);
      vst1q_f32(c2, vacc2x0123);
      vst1q_f32(c2 + 4, vacc2x4567);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      vst1q_f32(c1, vacc1x0123);
      vst1q_f32(c1 + 4, vacc1x4567);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      vst1q_f32(c0, vacc0x0123);
      vst1q_f32(c0 + 4, vacc0x4567);
      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      a3 = (const float*) ((uintptr_t) a3 - kc);
      a2 = (const float*) ((uintptr_t) a2 - kc);
      a1 = (const float*) ((uintptr_t) a1 - kc);
      a0 = (const float*) ((uintptr_t) a0 - kc);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1q_f32(c3, vacc3x0123); c3 += 4;
        vst1q_f32(c2, vacc2x0123); c2 += 4;
        vst1q_f32(c1, vacc1x0123); c1 += 4;
        vst1q_f32(c0, vacc0x0123); c0 += 4;

        vacc3x0123 = vacc3x4567;
        vacc2x0123 = vacc2x4567;
        vacc1x0123 = vacc1x4567;
        vacc0x0123 = vacc0x4567;
      }
      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
      if (nc & 2) {
        vst1_f32(c3, vacc3x01); c3 += 2;
        vst1_f32(c2, vacc2x01); c2 += 2;
        vst1_f32(c1, vacc1x01); c1 += 2;
        vst1_f32(c0, vacc0x01); c0 += 2;

        vacc3x01 = vget_high_f32(vacc3x0123);
        vacc2x01 = vget_high_f32(vacc2x0123);
        vacc1x01 = vget_high_f32(vacc1x0123);
        vacc0x01 = vget_high_f32(vacc0x0123);
      }
      if (nc & 1) {
        vst1_lane_f32(c3, vacc3x01, 0);
        vst1_lane_f32(c2, vacc2x01, 0);
        vst1_lane_f32(c1, vacc1x01, 0);
        vst1_lane_f32(c0, vacc0x01, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
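// The _ld128 variant above unrolls the K loop by 4, loading a full 128-bit
// vld1q_f32 of each A row per iteration and broadcasting its lanes with
// vmlaq_lane_f32; the _ld64 variant below unrolls by 2 with 64-bit vld1_f32
// loads. Both finish an odd K remainder with a vld1q_dup_f32 broadcast
// multiply-accumulate.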
xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 =
vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_ibilinear_chw_ukernel__neon_p8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t input_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(input_increment % sizeof(float) == 0); do { const float** i = input; const float* w = weights; size_t p = output_pixels; for (; p >= 8; p -= 8) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); const float* itl4 = (const float*) ((uintptr_t) i[8] + input_offset); const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset); const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset); const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset); const float* itl6 = (const float*) ((uintptr_t)
i[12] + input_offset); const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset); const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset); const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset); i += 2 * 8; const float32x4x2_t vw0123 = vld2q_f32(w + 0); const float32x4x2_t vw4567 = vld2q_f32(w + 8); w += 2 * 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x2_t vtltr4 = vld1_f32(itl4); const float32x2_t vblbr4 = vld1_f32(ibl4); const float32x2_t vtltr5 = vld1_f32(itl5); const float32x2_t vblbr5 = vld1_f32(ibl5); const float32x2_t vtltr6 = vld1_f32(itl6); const float32x2_t vblbr6 = vld1_f32(ibl6); const float32x2_t vtltr7 = vld1_f32(itl7); const float32x2_t vblbr7 = vld1_f32(ibl7); const float32x4_t valphah0123 = vw0123.val[0]; const float32x4_t valphav0123 = vw0123.val[1]; const float32x4_t valphah4567 = vw4567.val[0]; const float32x4_t valphav4567 = vw4567.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vtltr45 = vcombine_f32(vtltr4, vtltr5); const float32x4_t vblbr45 = vcombine_f32(vblbr4, vblbr5); const float32x4_t vtltr67 = vcombine_f32(vtltr6, vtltr7); const float32x4_t vblbr67 = vcombine_f32(vblbr6, vblbr7); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4_t vldrd45 = vsubq_f32(vblbr45, vtltr45); const float32x4_t vldrd67 = vsubq_f32(vblbr67, vtltr67); const float32x4x2_t vld_t0123 = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld0123 = vld_t0123.val[0]; const float32x4_t vrd0123 = vld_t0123.val[1]; const float32x4x2_t vld_t4567 = vuzpq_f32(vldrd45, vldrd67); const float32x4_t vld4567 = vld_t4567.val[0]; const float32x4_t vrd4567 = vld_t4567.val[1]; const float32x4x2_t vtl_t0123 = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl0123 = vtl_t0123.val[0]; const float32x4_t vtr0123 = vtl_t0123.val[1]; const float32x4x2_t vtl_t4567 = vuzpq_f32(vtltr45, vtltr67); const float32x4_t vtl4567 = vtl_t4567.val[0]; const float32x4_t vtr4567 = vtl_t4567.val[1]; const float32x4_t vl0123 = vmlaq_f32(vtl0123, vld0123, valphav0123); const float32x4_t vr0123 = vmlaq_f32(vtr0123, vrd0123, valphav0123); const float32x4_t vl4567 = vmlaq_f32(vtl4567, vld4567, valphav4567); const float32x4_t vr4567 = vmlaq_f32(vtr4567, vrd4567, valphav4567); const float32x4_t vd0123 = vsubq_f32(vr0123, vl0123); const float32x4_t vd4567 = vsubq_f32(vr4567, vl4567); const float32x4_t vo0123 = vmlaq_f32(vl0123, vd0123, valphah0123); const float32x4_t vo4567 = vmlaq_f32(vl4567, vd4567, valphah4567); vst1q_f32(output + 0, vo0123); vst1q_f32(output + 4, vo4567); output += 8; } for (; p >= 4; p -= 4) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) 
i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); i += 8; const float32x4x2_t vw = vld2q_f32(w); w += 8; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vtltr2 = vld1_f32(itl2); const float32x2_t vblbr2 = vld1_f32(ibl2); const float32x2_t vtltr3 = vld1_f32(itl3); const float32x2_t vblbr3 = vld1_f32(ibl3); const float32x4_t valphah = vw.val[0]; const float32x4_t valphav = vw.val[1]; const float32x4_t vtltr01 = vcombine_f32(vtltr0, vtltr1); const float32x4_t vblbr01 = vcombine_f32(vblbr0, vblbr1); const float32x4_t vtltr23 = vcombine_f32(vtltr2, vtltr3); const float32x4_t vblbr23 = vcombine_f32(vblbr2, vblbr3); const float32x4_t vldrd01 = vsubq_f32(vblbr01, vtltr01); const float32x4_t vldrd23 = vsubq_f32(vblbr23, vtltr23); const float32x4x2_t vld_t = vuzpq_f32(vldrd01, vldrd23); const float32x4_t vld = vld_t.val[0]; const float32x4_t vrd = vld_t.val[1]; const float32x4x2_t vtl_t = vuzpq_f32(vtltr01, vtltr23); const float32x4_t vtl = vtl_t.val[0]; const float32x4_t vtr = vtl_t.val[1]; const float32x4_t vl = vmlaq_f32(vtl, vld, valphav); const float32x4_t vr = vmlaq_f32(vtr, vrd, valphav); const float32x4_t vd = vsubq_f32(vr, vl); const float32x4_t vo = vmlaq_f32(vl, vd, valphah); vst1q_f32(output, vo); output += 4; } if XNN_UNLIKELY(p != 0) { if (p & 2) { const float32x2x2_t vw = vld2_f32(w); w += 4; const float32x2_t valphah = vw.val[0]; const float32x2_t valphav = vw.val[1]; const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); i += 4; const float32x2_t vtltr0 = vld1_f32(itl0); const float32x2_t vblbr0 = vld1_f32(ibl0); const float32x2_t vtltr1 = vld1_f32(itl1); const float32x2_t vblbr1 = vld1_f32(ibl1); const float32x2_t vldrd0 = vsub_f32(vblbr0, vtltr0); const float32x2_t vldrd1 = vsub_f32(vblbr1, vtltr1); const float32x2x2_t vld_t = vuzp_f32(vldrd0, vldrd1); const float32x2_t vld = vld_t.val[0]; const float32x2_t vrd = vld_t.val[1]; const float32x2x2_t vtl_t = vuzp_f32(vtltr0, vtltr1); const float32x2_t vtl = vtl_t.val[0]; const float32x2_t vtr = vtl_t.val[1]; const float32x2_t vl = vmla_f32(vtl, vld, valphav); const float32x2_t vr = vmla_f32(vtr, vrd, valphav); const float32x2_t vd = vsub_f32(vr, vl); const float32x2_t vo = vmla_f32(vl, vd, valphah); vst1_f32(output, vo); output += 2; } if (p & 1) { // We are computing the following formula: // result = (1 - alpha_h) * (1 - alpha_v) * top_left + // alpha_h * (1 - alpha_v) * top_right + // (1 - alpha_h) * alpha_v * bottom_left + // alpha_h * alpha_v * bottom_right. // // Rearranging gives // result = left + alpha_h * (right - left), // where // left = top_left + alpha_v * (bottom_left - top_left), // right = top_right + alpha_v * (bottom_right - top_right). 
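// A quick numeric check of the rearrangement (values chosen purely for
// illustration): with top_left = 0, top_right = 4, bottom_left = 2,
// bottom_right = 6, alpha_h = 0.25 and alpha_v = 0.5, the direct weighted
// sum gives
//   0.375*0 + 0.125*4 + 0.375*2 + 0.125*6 = 2,
// and the rearranged form agrees: left = 0 + 0.5*(2 - 0) = 1,
// right = 4 + 0.5*(6 - 4) = 5, result = 1 + 0.25*(5 - 1) = 2.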
const float alphah = *w; const float32x2_t valphav = vld1_dup_f32(w + 1); w += 2; const float* itl = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); i += 2; const float32x2_t vtltr = vld1_f32(itl); const float32x2_t vblbr = vld1_f32(ibl); // Compute at once // left_diff = bottom_left - top_left // right_diff = bottom_right - top_right const float32x2_t vldrd = vsub_f32(vblbr, vtltr); const float32x2_t vlr = vmla_f32(vtltr, vldrd, valphav); // Extract them and compute the result. const float l = vget_lane_f32(vlr, 0); const float r = vget_lane_f32(vlr, 1); *output++ = l + alphah * (r - l); } } input_offset += input_increment; } while (--channels != 0); } void xnn_f32_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); do { const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset); const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset); const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset); const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset); input += 4; const float32x2_t valphahv = vld1_f32(weights); weights += 2; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtl4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr4567 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl4567 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr4567 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vtd4567 = vsubq_f32(vtr4567, vtl4567); const float32x4_t vbd4567 = vsubq_f32(vbr4567, vbl4567); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vt4567 = vmlaq_lane_f32(vtl4567, vtd4567, valphahv, 0); const float32x4_t vb4567 = vmlaq_lane_f32(vbl4567, vbd4567, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vd4567 = vsubq_f32(vb4567, vt4567); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); const float32x4_t vo4567 = vmlaq_lane_f32(vt4567, vd4567, valphahv, 1); vst1q_f32(output, vo0123); output += 4; vst1q_f32(output, vo4567); output += 4; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vtl0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vtr0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vbl0123 = vld1q_f32(i2); i2 += 4; const float32x4_t vbr0123 = vld1q_f32(i3); i3 += 4; const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); vst1q_f32(output, vo0123); output += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vtl0123 = vld1q_f32(i0); const float32x4_t 
vtr0123 = vld1q_f32(i1); const float32x4_t vbl0123 = vld1q_f32(i2); const float32x4_t vbr0123 = vld1q_f32(i3); const float32x4_t vtd0123 = vsubq_f32(vtr0123, vtl0123); const float32x4_t vbd0123 = vsubq_f32(vbr0123, vbl0123); const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); float32x2_t vo01 = vget_low_f32(vo0123); if (c & (2 * sizeof(float))) { vst1_f32(output, vo01); output += 2; vo01 = vget_high_f32(vo0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(output, vo01, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } p -= 1 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union
xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x2_t vacc0x01 = vld1_f32(w); w += 2; float32x2_t vacc1x01 = vacc0x01; float32x2_t vacc2x01 = vacc0x01; float32x2_t vacc3x01 = vacc0x01; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x2_t vb01c0 = vld1_f32(w); w += 2; vacc0x01 = vmla_lane_f32(vacc0x01, vb01c0, va0, 0); vacc1x01 = vmla_lane_f32(vacc1x01, vb01c0, va1, 0); vacc2x01 = vmla_lane_f32(vacc2x01, vb01c0, va2, 0); vacc3x01 = vmla_lane_f32(vacc3x01, vb01c0, va3, 0); const float32x2_t vb01c1 = vld1_f32(w); w += 2; vacc0x01 = vmla_lane_f32(vacc0x01, vb01c1, va0, 1); vacc1x01 = vmla_lane_f32(vacc1x01, vb01c1, va1, 1); vacc2x01 = vmla_lane_f32(vacc2x01, vb01c1, va2, 1); vacc3x01 = vmla_lane_f32(vacc3x01, vb01c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x2_t va0 = vld1_dup_f32(a0); const float32x2_t va1 = vld1_dup_f32(a1); const float32x2_t va2 = vld1_dup_f32(a2); const float32x2_t va3 = vld1_dup_f32(a3); const float32x2_t vb01 = vld1_f32(w); w += 2; vacc0x01 = vmla_f32(vacc0x01, va0, vb01); vacc1x01 = vmla_f32(vacc1x01, va1, vb01); vacc2x01 = vmla_f32(vacc2x01, va2, vb01); vacc3x01 = vmla_f32(vacc3x01, va3, vb01); } p -= 4 * sizeof(void*); } while (p != 0); const float32x2_t vmax = vld1_dup_f32(&params->scalar.max); vacc0x01 = vmin_f32(vacc0x01, vmax); vacc1x01 = vmin_f32(vacc1x01, vmax); vacc2x01 = vmin_f32(vacc2x01, vmax); vacc3x01 = vmin_f32(vacc3x01, vmax); const float32x2_t vmin = vld1_dup_f32(&params->scalar.min); vacc0x01 = vmax_f32(vacc0x01, vmin); vacc1x01 = vmax_f32(vacc1x01, vmin); vacc2x01 = vmax_f32(vacc2x01, vmin); vacc3x01 = vmax_f32(vacc3x01, vmin); if XNN_LIKELY(nc >= 2) { vst1_f32(c3, vacc3x01); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1_f32(c2, vacc2x01); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1_f32(c1, vacc1x01); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1_f32(c0, vacc0x01); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 2; } else { assert(nc == 1); vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); nc = 0; } } while (nc != 0); } void
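/*
 * Indirect (IGEMM) variant of the 4x8 minmax GEMM: instead of strided rows,
 * the A operand arrives as an array of ks row pointers per output tile, and
 * any pointer equal to the shared `zero` buffer skips the a_offset
 * adjustment so padding taps read zeros. "ld128" refers to the 128-bit
 * vld1q_f32 loads of A in the main loop, which is unrolled over four k
 * steps; a scalar-broadcast loop consumes the k remainder one float at a
 * time.
 */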
xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(a0); a0 += 4; const float32x4_t va1 = vld1q_f32(a1); a1 += 4; const float32x4_t va2 = vld1q_f32(a2); a2 += 4; const float32x4_t va3 = vld1q_f32(a3); a3 += 4; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1); const float32x4_t vb0123c2 = vld1q_f32(w); w += 4; const float32x4_t vb4567c2 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, 
vget_high_f32(va0), 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0); const float32x4_t vb0123c3 = vld1q_f32(w); w += 4; const float32x4_t vb4567c3 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1); } if XNN_UNLIKELY(k != 0) { do { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); k -= sizeof(float); } while (k != 0); } p -= 4 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123
= vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x4567 = vld1q_f32(w); w += 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const float32x4_t vb0123c0 = vld1q_f32(w); w += 4; const float32x4_t vb4567c0 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); const float32x4_t vb0123c1 = vld1q_f32(w); w += 4; const float32x4_t vb4567c1 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = 
vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); const float32x4_t va1 = vld1q_dup_f32(a1); const float32x4_t va2 = vld1q_dup_f32(a2); const float32x4_t va3 = vld1q_dup_f32(a3); const float32x4_t vb0123 = vld1q_f32(w); w += 4; const float32x4_t vb4567 = vld1q_f32(w); w += 4; vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } p -= 4 * sizeof(void*); } while (p != 0); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); do { float* o = output; { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; const float* i8 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); i8 = (const float*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 4; c -= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vmax018 = vmaxq_f32(vmaxq_f32(vi0, vi1), vi8); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax01678 = vmaxq_f32(vmax018, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax01678); const float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); vst1q_f32(o, vout); o += 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vmax018 = vmaxq_f32(vmaxq_f32(vi0, vi1), vi8); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax01678 = vmaxq_f32(vmax018, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax01678); float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(o, vout_lo); o += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const
float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 4; c -= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vo = vld1q_f32(o); const float32x4_t vmax01 = vmaxq_f32(vmaxq_f32(vi0, vi1), vo); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax0167 = vmaxq_f32(vmax01, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax0167); const float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); vst1q_f32(o, vout); o += 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vo = vld1q_f32(o); const float32x4_t vmax01 = vmaxq_f32(vmaxq_f32(vi0, vi1), vo); const float32x4_t vmax23 = vmaxq_f32(vi2, vi3); const float32x4_t vmax45 = vmaxq_f32(vi4, vi5); const float32x4_t vmax67 = vmaxq_f32(vi6, vi7); const float32x4_t vmax2345 = vmaxq_f32(vmax23, vmax45); const float32x4_t vmax0167 = vmaxq_f32(vmax01, vmax67); const float32x4_t vmax = vmaxq_f32(vmax2345, vmax0167); float32x4_t vout = vmaxq_f32(vminq_f32(vmax, voutput_max), voutput_min); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(o, vout_lo); o += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(o, vout_lo, 0); o += 1; } } } input = (const float**) ((uintptr_t) input + input_increment); output = (float*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* buffer, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements > 9); assert(channels != 0); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); do { { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++;
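/* As with i0 above, each remaining tap pointer is offset by input_offset only when it does not alias the shared "zero" buffer, so padding taps keep contributing zeros to the running sums accumulated in buffer. */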
assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = *input++; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); vst1q_f32(b, vsum); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = 
vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vacc = vld1q_f32(b); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); vst1q_f32(b, vsum); b += 4; } } { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; input = (const float**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float32x4_t vmultiplier = vld1q_dup_f32(multiplier); multiplier += 1; size_t c = channels; float* b = buffer; while (c >= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vacc = vld1q_f32(b); b += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); vst1q_f32(output, vout); output += 4; c -= 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vacc = vld1q_f32(b); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = 
vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum01a = vaddq_f32(vsum01, vacc); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum0167a = vaddq_f32(vsum01a, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum0167a); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(output, vout_lo); output += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(output, vout_lo, 0); output += 1; } } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(kernel_elements <= 9); assert(channels != 0); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); do { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; const float* i8 = input[8]; input = (const float**) ((uintptr_t) input + input_increment); if (kernel_elements < 2) { i1 = zero; } assert(i1 != NULL); if (kernel_elements <= 2) { i2 = zero; } assert(i2 != NULL); if (kernel_elements < 4) { i3 = zero; } assert(i3 != NULL); if (kernel_elements <= 4) { i4 = zero; } assert(i4 != NULL); if (kernel_elements < 6) { i5 = zero; } assert(i5 != NULL); if (kernel_elements <= 6) { i6 = zero; } assert(i6 != NULL); if (kernel_elements < 8) { i7 = zero; } assert(i7 != NULL); if (kernel_elements <= 8) { i8 = zero; } assert(i8 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float32x4_t vmultiplier = vld1q_dup_f32(multiplier); multiplier += 1; size_t c = channels; while (c >= 4) { const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; const float32x4_t vi7 = vld1q_f32(i7); i7 += 4; const float32x4_t vi8 = vld1q_f32(i8); i8 += 4; const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 =
vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); vst1q_f32(output, vout); output += 4; c -= 4; } if (c != 0) { const float32x4_t vi0 = vld1q_f32(i0); const float32x4_t vi1 = vld1q_f32(i1); const float32x4_t vi2 = vld1q_f32(i2); const float32x4_t vi3 = vld1q_f32(i3); const float32x4_t vi4 = vld1q_f32(i4); const float32x4_t vi5 = vld1q_f32(i5); const float32x4_t vi6 = vld1q_f32(i6); const float32x4_t vi7 = vld1q_f32(i7); const float32x4_t vi8 = vld1q_f32(i8); const float32x4_t vsum01 = vaddq_f32(vi0, vi1); const float32x4_t vsum23 = vaddq_f32(vi2, vi3); const float32x4_t vsum45 = vaddq_f32(vi4, vi5); const float32x4_t vsum67 = vaddq_f32(vi6, vi7); const float32x4_t vsum018 = vaddq_f32(vsum01, vi8); const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); const float32x4_t vsum01678 = vaddq_f32(vsum018, vsum67); const float32x4_t vsum = vaddq_f32(vsum2345, vsum01678); float32x4_t vout = vmulq_f32(vsum, vmultiplier); vout = vmaxq_f32(vout, voutput_min); vout = vminq_f32(vout, voutput_max); float32x2_t vout_lo = vget_low_f32(vout); if (c & 2) { vst1_f32(output, vout_lo); output += 2; vout_lo = vget_high_f32(vout); } if (c & 1) { vst1_lane_f32(output, vout_lo, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_prelu_ukernel__neon_2x8( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vw4567 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc0x4567 = vmulq_f32(vi0x4567, vw4567); const uint32x4_t vm0x4567 = vcltq_s32(vreinterpretq_s32_f32(vi0x4567), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); float32x4_t vacc1x4567 = vmulq_f32(vi1x4567, vw4567); const uint32x4_t vm1x4567 = vcltq_s32(vreinterpretq_s32_f32(vi1x4567), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc0x4567 = vbslq_f32(vm0x4567, vacc0x4567, vi0x4567); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); vacc1x4567 = vbslq_f32(vm1x4567, vacc1x4567, vi1x4567); 
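/*
 * PReLU select: the vcltq_s32 comparisons above treat the raw float bits as
 * signed integers, so lanes with the sign bit set (negative inputs) produce
 * an all-ones mask. vbslq_f32 then keeps the scaled product (weight * x) for
 * those lanes and passes the input through unchanged elsewhere, i.e.
 * f(x) = x >= 0 ? x : w * x.
 */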
vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o0, vacc0x4567); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; vst1q_f32(o1, vacc1x4567); o1 += 4; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4; const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4; float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vw0123 = vld1q_f32(w); w += 4; const float32x4_t vi0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); const float32x4_t vi1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); float32x4_t vacc0x0123 = vmulq_f32(vi0x0123, vw0123); const uint32x4_t vm0x0123 = vcltq_s32(vreinterpretq_s32_f32(vi0x0123), vmovq_n_s32(0)); float32x4_t vacc1x0123 = vmulq_f32(vi1x0123, vw0123); const uint32x4_t vm1x0123 = vcltq_s32(vreinterpretq_s32_f32(vi1x0123), vmovq_n_s32(0)); vacc0x0123 = vbslq_f32(vm0x0123, vacc0x0123, vi0x0123); vacc1x0123 = vbslq_f32(vm1x0123, vacc1x0123, vi1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); if (c & (2 * sizeof(float))) { vst1_f32(o0, vacc0x01); o0 += 2; vst1_f32(o1, vacc1x01); o1 += 2; vacc0x01 = vget_high_f32(vacc0x0123); vacc1x01 = vget_high_f32(vacc1x0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_qc4w_gemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const int16x8_t vminus_kernel_zero_point = vld1q_dup_s16(&params->scalar.minus_kernel_zero_point[0]); const uint8x8_t vmask = vmov_n_u8(UINT8_C(0xF)); do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const uint8x8_t vw01234567c01 = vld1_u8(w); w = (const uint8_t*) w + 8; const uint8x8_t vw01234567c0 = vand_u8(vw01234567c01, vmask); const uint8x8_t vw01234567c1 = vshr_n_u8(vw01234567c01, 4); const int16x8_t vxw01234567c0 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c0)); const int16x8_t vxw01234567c1 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c1)); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const
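/*
 * QC4W weight layout, as unpacked above: each packed byte holds two
 * 4-bit quantized weights, column c0 in the low nibble and c1 in the
 * high nibble. vand_u8/vshr_n_u8 split them, and vaddw_s8 against the
 * pre-negated zero point recenters and widens in one step. Per byte b:
 *
 *   w_c0 = (int) (b & 0xF) - kernel_zero_point;
 *   w_c1 = (int) (b >> 4) - kernel_zero_point;
 */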
int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const uint8x8_t vw01234567 = vld1_u8(w); w = (const uint8_t*) w + 8; const int16x8_t vxw01234567 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567)); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01234567)); const int32x4_t vxw4567 = vmovl_s16(vget_high_s16(vxw01234567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc4w_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_qc4w_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const int16x8_t vminus_kernel_zero_point = vld1q_dup_s16(&params->scalar.minus_kernel_zero_point[0]); const uint8x8_t vmask = vmov_n_u8(UINT8_C(0xF)); do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc1x0123 = vacc0x0123; float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t
vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const uint8x8_t vw01234567c01 = vld1_u8(w); w = (const uint8_t*) w + 8; const uint8x8_t vw01234567c0 = vand_u8(vw01234567c01, vmask); const uint8x8_t vw01234567c1 = vshr_n_u8(vw01234567c01, 4); const int16x8_t vxw01234567c0 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c0)); const int16x8_t vxw01234567c1 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567c1)); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const uint8x8_t vw01234567 = vld1_u8(w); w = (const uint8_t*) w + 8; const int16x8_t vxw01234567 = vaddw_s8(vminus_kernel_zero_point, vreinterpret_s8_u8(vw01234567)); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01234567)); const int32x4_t vxw4567 = vmovl_s16(vget_high_s16(vxw01234567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); vacc2x0123 = vmulq_f32(vacc2x0123, vscale0123); vacc3x0123 = vmulq_f32(vacc3x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 
4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); vacc1x4567 = vmulq_f32(vacc1x4567, vscale4567); vacc2x4567 = vmulq_f32(vacc2x4567, vscale4567); vacc3x4567 = vmulq_f32(vacc3x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_1x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const int8x8_t vw01234567c0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vw01234567c1 = vld1_s8(w); w = (const int8_t*) w + 8; const int16x8_t vxw01234567c0 = vmovl_s8(vw01234567c0); const int16x8_t vxw01234567c1 = vmovl_s8(vw01234567c1); const int32x4_t vxw0123c0 =
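/*
 * The qc8w kernels widen the int8 weights in two vmovl steps
 * (s8 -> s16 -> s32) and convert to float once per loaded block; the
 * dot product itself then runs entirely in f32. Each loop iteration
 * consumes two k-steps per row: vmlaq_lane_f32 with lane 0 applies the
 * first loaded input element and lane 1 the second, so one 64-bit
 * vld1_f32 feeds two multiply-accumulates.
 */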
vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const int8x8_t vw01230123 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int8x8_t vw45674567 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int16x8_t vxw01230123 = vmovl_s8(vw01230123); const int16x8_t vxw45674567 = vmovl_s8(vw45674567); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01230123)); const int32x4_t vxw4567 = vmovl_s16(vget_low_s16(vxw45674567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vacc0x0123); c0 += 4; vacc0x0123 = vacc0x4567; } float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c0, vacc0x01); c0 += 2; vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_4x8__neon_lane_ld64( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { float32x4_t vacc0x0123 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc0x4567 = vld1q_f32(w); w = (const float*) w + 4; float32x4_t vacc1x0123 = vacc0x0123;
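/*
 * Rows 1..3 start from copies of row 0's accumulators because the packed
 * weight stream stores a single shared bias vector per 8-column block.
 * Out-of-range rows were aliased onto valid ones above (a1 = a0, c1 = c0,
 * and so on), so this 4-row path handles any mr <= 4; stores through the
 * duplicated c pointers simply overwrite the same output row.
 */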
float32x4_t vacc1x4567 = vacc0x4567; float32x4_t vacc2x0123 = vacc0x0123; float32x4_t vacc2x4567 = vacc0x4567; float32x4_t vacc3x0123 = vacc0x0123; float32x4_t vacc3x4567 = vacc0x4567; size_t k = kc; for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) { const float32x2_t va0 = vld1_f32(a0); a0 += 2; const float32x2_t va1 = vld1_f32(a1); a1 += 2; const float32x2_t va2 = vld1_f32(a2); a2 += 2; const float32x2_t va3 = vld1_f32(a3); a3 += 2; const int8x8_t vw01234567c0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vw01234567c1 = vld1_s8(w); w = (const int8_t*) w + 8; const int16x8_t vxw01234567c0 = vmovl_s8(vw01234567c0); const int16x8_t vxw01234567c1 = vmovl_s8(vw01234567c1); const int32x4_t vxw0123c0 = vmovl_s16(vget_low_s16(vxw01234567c0)); const int32x4_t vxw4567c0 = vmovl_s16(vget_high_s16(vxw01234567c0)); const int32x4_t vxw0123c1 = vmovl_s16(vget_low_s16(vxw01234567c1)); const int32x4_t vxw4567c1 = vmovl_s16(vget_high_s16(vxw01234567c1)); const float32x4_t vb0123c0 = vcvtq_f32_s32(vxw0123c0); const float32x4_t vb0123c1 = vcvtq_f32_s32(vxw0123c1); const float32x4_t vb4567c0 = vcvtq_f32_s32(vxw4567c0); const float32x4_t vb4567c1 = vcvtq_f32_s32(vxw4567c1); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0); vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1); vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1); vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1); vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1); vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1); vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1); vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1); vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1); } if XNN_UNLIKELY(k != 0) { const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1; const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1; const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1; const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1; const int8x8_t vw01230123 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int8x8_t vw45674567 = vreinterpret_s8_u32(vld1_dup_u32(w)); w = (const int8_t*) w + 4; const int16x8_t vxw01230123 = vmovl_s8(vw01230123); const int16x8_t vxw45674567 = vmovl_s8(vw45674567); const int32x4_t vxw0123 = vmovl_s16(vget_low_s16(vxw01230123)); const int32x4_t vxw4567 = vmovl_s16(vget_low_s16(vxw45674567)); const float32x4_t vb0123 = vcvtq_f32_s32(vxw0123); const float32x4_t vb4567 = vcvtq_f32_s32(vxw4567); vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123); vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123); vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123); vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123); vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567); vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567); vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567); vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567); } const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); vacc2x0123 = vmulq_f32(vacc2x0123, vscale0123); vacc3x0123 = vmulq_f32(vacc3x0123, vscale0123); const 
float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vacc0x4567 = vmulq_f32(vacc0x4567, vscale4567); vacc1x4567 = vmulq_f32(vacc1x4567, vscale4567); vacc2x4567 = vmulq_f32(vacc2x4567, vscale4567); vacc3x4567 = vmulq_f32(vacc3x4567, vscale4567); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vacc2x0123 = vminq_f32(vacc2x0123, vmax); vacc3x0123 = vminq_f32(vacc3x0123, vmax); vacc0x4567 = vminq_f32(vacc0x4567, vmax); vacc1x4567 = vminq_f32(vacc1x4567, vmax); vacc2x4567 = vminq_f32(vacc2x4567, vmax); vacc3x4567 = vminq_f32(vacc3x4567, vmax); const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc2x0123 = vmaxq_f32(vacc2x0123, vmin); vacc3x0123 = vmaxq_f32(vacc3x0123, vmin); vacc0x4567 = vmaxq_f32(vacc0x4567, vmin); vacc1x4567 = vmaxq_f32(vacc1x4567, vmin); vacc2x4567 = vmaxq_f32(vacc2x4567, vmin); vacc3x4567 = vmaxq_f32(vacc3x4567, vmin); if XNN_LIKELY(nc >= 8) { vst1q_f32(c3, vacc3x0123); vst1q_f32(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); vst1q_f32(c2, vacc2x0123); vst1q_f32(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); vst1q_f32(c1, vacc1x0123); vst1q_f32(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); vst1q_f32(c0, vacc0x0123); vst1q_f32(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1q_f32(c3, vacc3x0123); c3 += 4; vst1q_f32(c2, vacc2x0123); c2 += 4; vst1q_f32(c1, vacc1x0123); c1 += 4; vst1q_f32(c0, vacc0x0123); c0 += 4; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; } float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); if (nc & 2) { vst1_f32(c3, vacc3x01); c3 += 2; vst1_f32(c2, vacc2x01); c2 += 2; vst1_f32(c1, vacc1x01); c1 += 2; vst1_f32(c0, vacc0x01); c0 += 2; vacc3x01 = vget_high_f32(vacc3x0123); vacc2x01 = vget_high_f32(vacc2x0123); vacc1x01 = vget_high_f32(vacc1x0123); vacc0x01 = vget_high_f32(vacc0x0123); } if (nc & 1) { vst1_lane_f32(c3, vacc3x01, 0); vst1_lane_f32(c2, vacc2x01, 0); vst1_lane_f32(c1, vacc1x01, 0); vst1_lane_f32(c0, vacc0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qs8_vcvt_ukernel__neon_x32( size_t batch, const float* input, int8_t* output, const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon.magic_bias); const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->neon.magic_bias_less_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vxGHIJ = vld1q_f32(input); input
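/*
 * Quantizing conversion used by this kernel (and the qu8 variant below):
 * after scaling, adding the "magic bias" aligns the value's integer part
 * with the low mantissa bits, so the float's bit pattern minus a
 * precomputed constant yields the rounded, zero-point-adjusted integer.
 * vqsubq_s32 keeps the subtraction saturating. A scalar sketch, with
 * fp32_bits() as a hypothetical bit-cast helper:
 *
 *   int32_t q = (int32_t) fp32_bits(x * scale + magic_bias)
 *             - magic_bias_less_zero_point;
 */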
+= 4; float32x4_t vxKLMN = vld1q_f32(input); input += 4; float32x4_t vxOPQR = vld1q_f32(input); input += 4; float32x4_t vxSTUV = vld1q_f32(input); input += 4; vx0123 = vmulq_f32(vx0123, vscale); vx4567 = vmulq_f32(vx4567, vscale); vx89AB = vmulq_f32(vx89AB, vscale); vxCDEF = vmulq_f32(vxCDEF, vscale); vxGHIJ = vmulq_f32(vxGHIJ, vscale); vxKLMN = vmulq_f32(vxKLMN, vscale); vxOPQR = vmulq_f32(vxOPQR, vscale); vxSTUV = vmulq_f32(vxSTUV, vscale); vx0123 = vaddq_f32(vx0123, vmagic_bias); vx4567 = vaddq_f32(vx4567, vmagic_bias); vx89AB = vaddq_f32(vx89AB, vmagic_bias); vxCDEF = vaddq_f32(vxCDEF, vmagic_bias); vxGHIJ = vaddq_f32(vxGHIJ, vmagic_bias); vxKLMN = vaddq_f32(vxKLMN, vmagic_bias); vxOPQR = vaddq_f32(vxOPQR, vmagic_bias); vxSTUV = vaddq_f32(vxSTUV, vmagic_bias); const int32x4_t vacc0123 = vqsubq_s32(vreinterpretq_s32_f32(vx0123), vmagic_bias_less_zero_point); const int32x4_t vacc4567 = vqsubq_s32(vreinterpretq_s32_f32(vx4567), vmagic_bias_less_zero_point); const int32x4_t vacc89AB = vqsubq_s32(vreinterpretq_s32_f32(vx89AB), vmagic_bias_less_zero_point); const int32x4_t vaccCDEF = vqsubq_s32(vreinterpretq_s32_f32(vxCDEF), vmagic_bias_less_zero_point); const int32x4_t vaccGHIJ = vqsubq_s32(vreinterpretq_s32_f32(vxGHIJ), vmagic_bias_less_zero_point); const int32x4_t vaccKLMN = vqsubq_s32(vreinterpretq_s32_f32(vxKLMN), vmagic_bias_less_zero_point); const int32x4_t vaccOPQR = vqsubq_s32(vreinterpretq_s32_f32(vxOPQR), vmagic_bias_less_zero_point); const int32x4_t vaccSTUV = vqsubq_s32(vreinterpretq_s32_f32(vxSTUV), vmagic_bias_less_zero_point); const int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); const int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); const int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); int8x16_t vy0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t vyGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vy0123456789ABCDEF = vmaxq_s8(vy0123456789ABCDEF, voutput_min); vyGHIJKLMNOPQRSTUV = vmaxq_s8(vyGHIJKLMNOPQRSTUV, voutput_min); vy0123456789ABCDEF = vminq_s8(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = vminq_s8(vyGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vy0123456789ABCDEF); output += 16; vst1q_s8(output, vyGHIJKLMNOPQRSTUV); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); vy = vmax_s8(vy, vget_low_s8(voutput_min)); vy = vmin_s8(vy, vget_low_s8(voutput_max)); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); float32x4_t vx_lo = vld1q_f32(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); float32x4_t vx_hi = vld1q_f32(x_hi); vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); 
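/*
 * Tail handling for 1..7 leftover floats: instead of a scalar loop, the
 * kernel points vx_hi at input offset by (batch & 16) bytes, so the two
 * 4-wide loads cover the remainder (reading past it is sanctioned by
 * XNN_OOB_READS), and the converted bytes are stored below in 4-, 2-,
 * and 1-lane pieces.
 */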
vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); int8x8_t vy = vqmovn_s16(vacc); vy = vmax_s8(vy, vget_low_s8(voutput_min)); vy = vmin_s8(vy, vget_low_s8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(float))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(float))) { vst1_lane_s8(output, vy, 0); } } } void xnn_f32_qu8_vcvt_ukernel__neon_x32( size_t batch, const float* input, uint8_t* output, const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon.magic_bias); const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->neon.magic_bias_less_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vxGHIJ = vld1q_f32(input); input += 4; float32x4_t vxKLMN = vld1q_f32(input); input += 4; float32x4_t vxOPQR = vld1q_f32(input); input += 4; float32x4_t vxSTUV = vld1q_f32(input); input += 4; vx0123 = vmulq_f32(vx0123, vscale); vx4567 = vmulq_f32(vx4567, vscale); vx89AB = vmulq_f32(vx89AB, vscale); vxCDEF = vmulq_f32(vxCDEF, vscale); vxGHIJ = vmulq_f32(vxGHIJ, vscale); vxKLMN = vmulq_f32(vxKLMN, vscale); vxOPQR = vmulq_f32(vxOPQR, vscale); vxSTUV = vmulq_f32(vxSTUV, vscale); vx0123 = vaddq_f32(vx0123, vmagic_bias); vx4567 = vaddq_f32(vx4567, vmagic_bias); vx89AB = vaddq_f32(vx89AB, vmagic_bias); vxCDEF = vaddq_f32(vxCDEF, vmagic_bias); vxGHIJ = vaddq_f32(vxGHIJ, vmagic_bias); vxKLMN = vaddq_f32(vxKLMN, vmagic_bias); vxOPQR = vaddq_f32(vxOPQR, vmagic_bias); vxSTUV = vaddq_f32(vxSTUV, vmagic_bias); const int32x4_t vacc0123 = vqsubq_s32(vreinterpretq_s32_f32(vx0123), vmagic_bias_less_zero_point); const int32x4_t vacc4567 = vqsubq_s32(vreinterpretq_s32_f32(vx4567), vmagic_bias_less_zero_point); const int32x4_t vacc89AB = vqsubq_s32(vreinterpretq_s32_f32(vx89AB), vmagic_bias_less_zero_point); const int32x4_t vaccCDEF = vqsubq_s32(vreinterpretq_s32_f32(vxCDEF), vmagic_bias_less_zero_point); const int32x4_t vaccGHIJ = vqsubq_s32(vreinterpretq_s32_f32(vxGHIJ), vmagic_bias_less_zero_point); const int32x4_t vaccKLMN = vqsubq_s32(vreinterpretq_s32_f32(vxKLMN), vmagic_bias_less_zero_point); const int32x4_t vaccOPQR = vqsubq_s32(vreinterpretq_s32_f32(vxOPQR), vmagic_bias_less_zero_point); const int32x4_t vaccSTUV = vqsubq_s32(vreinterpretq_s32_f32(vxSTUV), vmagic_bias_less_zero_point); const int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); const int16x8_t vaccGHIJKLMN =
vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); const int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); uint8x16_t vy0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t vyGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vy0123456789ABCDEF = vmaxq_u8(vy0123456789ABCDEF, voutput_min); vyGHIJKLMNOPQRSTUV = vmaxq_u8(vyGHIJKLMNOPQRSTUV, voutput_min); vy0123456789ABCDEF = vminq_u8(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = vminq_u8(vyGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vy0123456789ABCDEF); output += 16; vst1q_u8(output, vyGHIJKLMNOPQRSTUV); output += 16; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx_lo = vld1q_f32(input); input += 4; float32x4_t vx_hi = vld1q_f32(input); input += 4; vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); vy = vmax_u8(vy, vget_low_u8(voutput_min)); vy = vmin_u8(vy, vget_low_u8(voutput_max)); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); float32x4_t vx_lo = vld1q_f32(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); float32x4_t vx_hi = vld1q_f32(x_hi); vx_lo = vmulq_f32(vx_lo, vscale); vx_hi = vmulq_f32(vx_hi, vscale); vx_lo = vaddq_f32(vx_lo, vmagic_bias); vx_hi = vaddq_f32(vx_hi, vmagic_bias); const int32x4_t vacc_lo = vqsubq_s32(vreinterpretq_s32_f32(vx_lo), vmagic_bias_less_zero_point); const int32x4_t vacc_hi = vqsubq_s32(vreinterpretq_s32_f32(vx_hi), vmagic_bias_less_zero_point); const int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); uint8x8_t vy = vqmovun_s16(vacc); vy = vmax_u8(vy, vget_low_u8(voutput_min)); vy = vmin_u8(vy, vget_low_u8(voutput_max)); if (batch & (4 * sizeof(float))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(float))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(float))) { vst1_lane_u8(output, vy, 0); } } } extern XNN_INTERNAL const float xnn_table_exp2_k_over_64[64]; void xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8( size_t batch, const float* input, const float* max, float* output, float* sum, const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(max != NULL); assert(output != NULL); assert(sum != NULL); const float32x4_t vi_max = vld1q_dup_f32(max); const float32x4_t vlog2e = vld1q_dup_f32(&params->neon_rr2_lut64_p2.log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut64_p2.magic_bias); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vminus_ln2_hi = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_ln2_hi); const float32x4_t vminus_ln2_lo = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_ln2_lo); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut64_p2.c2); const float32x4_t
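/*
 * rr2_lut64_p2 reduction used below: x is written as x = (n/64)*ln2 + t
 * with n = round(64*x/ln2), obtained via the magic-bias add. The low
 * 6 bits of n index a 64-entry table of 2^(j/64) while the remaining
 * bits form the exponent adjustment ve; t is reconstructed with a split
 * hi/lo ln2 ("rr2") to limit rounding error, and e^t is approximated by
 * the degree-2 polynomial 1 + t + c2*t^2 applied in fused form, so that
 * e^x = s * (1 + t + c2*t^2) with s = 2^(n/64). Inputs below
 * denorm_cutoff are flushed to zero with vbic.
 */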
vdenorm_cutoff = vld1q_dup_f32(&params->neon_rr2_lut64_p2.denorm_cutoff); float32x4_t vacc0 = vmovq_n_f32(0.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vi0123 = vld1q_f32(input); input += 4; const float32x4_t vi4567 = vld1q_f32(input); input += 4; const float32x4_t vx0123 = vsubq_f32(vi0123, vi_max); const float32x4_t vx4567 = vsubq_f32(vi4567, vi_max); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vx0123, vlog2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vx4567, vlog2e); const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x3F))), 17); const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx23]); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx67]); vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vt0123 = vmlaq_f32(vx0123, vn0123, vminus_ln2_hi); float32x4_t vt4567 = vmlaq_f32(vx4567, vn4567, vminus_ln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); vp0123 = vmlaq_f32(vt0123, vt0123, vp0123); vp4567 = vmlaq_f32(vt4567, vt4567, vp4567); float32x4_t vf0123 = vmlaq_f32(vs0123, vs0123, vp0123); float32x4_t vf4567 = vmlaq_f32(vs4567, vs4567, vp4567); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff))); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; vacc0 = vaddq_f32(vacc0, vf0123); vacc0 = vaddq_f32(vacc0, vf4567); } float32x4_t vacc = vacc0; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const
uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlaq_f32(vt, vt, vp); float32x4_t vf = vmlaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); vst1q_f32(output, vf); output += 4; vacc = vaddq_f32(vacc, vf); } #if XNN_ARCH_ARM64 float vacc_lo = vaddvq_f32(vacc); #else float32x2_t vacc_lo = vadd_f32(vget_high_f32(vacc), vget_low_f32(vacc)); #endif if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 3 * sizeof(float)); const float32x4_t vi = vld1q_f32(input); input += 4; const float32x4_t vx = vsubq_f32(vi, vi_max); float32x4_t vn = vmlaq_f32(vmagic_bias, vx, vlog2e); const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x3F))), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vx, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlaq_f32(vt, vt, vp); float32x4_t vf = vmlaq_f32(vs, vs, vp); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff))); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; #if XNN_ARCH_ARM64 vacc_lo += vaddv_f32(vf_lo); #else vacc_lo = vadd_f32(vacc_lo, vf_lo); #endif vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); #if XNN_ARCH_ARM64 vacc_lo += vget_lane_f32(vf_lo, 0); #else vacc_lo = vadd_f32(vacc_lo, vreinterpret_f32_u64(vshl_n_u64(vreinterpret_u64_f32(vf_lo), 32))); #endif } } #if XNN_ARCH_ARM64 *sum = vacc_lo; #else vst1_lane_f32(sum, vpadd_f32(vacc_lo, vacc_lo), 0); #endif } void xnn_f32_rmax_ukernel__neon( size_t batch, const float* input, float* output) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vmax0 = vld1q_dup_f32(input); float32x4_t vmax1 = vmax0; float32x4_t vmax2 = vmax0; float32x4_t vmax3 = vmax0; for (; batch >= 64; batch -= 64) { const float32x4_t vx0 = vld1q_f32(input); input += 4; const float32x4_t vx1 = vld1q_f32(input); input += 4; const float32x4_t vx2 = vld1q_f32(input); input += 4; const float32x4_t vx3 = 
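/*
 * Like the other reductions in this file, rmax keeps four independent
 * accumulators so successive vmaxq_f32 ops do not serialize on a single
 * register; they are folded into one vector after the unrolled loop and
 * reduced across lanes at the end (vpmax, or vmaxv on AArch64).
 */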
vld1q_f32(input); input += 4; vmax0 = vmaxq_f32(vmax0, vx0); vmax1 = vmaxq_f32(vmax1, vx1); vmax2 = vmaxq_f32(vmax2, vx2); vmax3 = vmaxq_f32(vmax3, vx3); } float32x4_t vmax = vmaxq_f32(vmaxq_f32(vmax0, vmax1), vmaxq_f32(vmax2, vmax3)); for (; batch >= 16; batch -= 16) { const float32x4_t vx = vld1q_f32(input); input += 4; vmax = vmaxq_f32(vmax, vx); } #if XNN_ARCH_ARM64 float32x2_t vmax_lo = vget_low_f32(vpmaxq_f32(vmax, vmax)); #else float32x2_t vmax_lo = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); #endif if XNN_UNLIKELY(batch != 0) { do { const float32x2_t vx = vld1_dup_f32(input); input += 1; vmax_lo = vmax_f32(vmax_lo, vx); batch -= 4; } while (batch != 0); } #if XNN_ARCH_ARM64 *output = vmaxv_f32(vmax_lo); #else vst1_lane_f32(output, vpmax_f32(vmax_lo, vmax_lo), 0); #endif } void xnn_f32_rminmax_ukernel__neon_x16_acc4( size_t batch, const float* input, float* output, const union xnn_f32_default_params* params) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vmin0 = vld1q_dup_f32(input); float32x4_t vmax0 = vmin0; float32x4_t vmin1 = vmin0; float32x4_t vmax1 = vmax0; float32x4_t vmin2 = vmin0; float32x4_t vmax2 = vmax0; float32x4_t vmin3 = vmin0; float32x4_t vmax3 = vmax0; for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vt0 = vld1q_f32(input); input += 4; const float32x4_t vt1 = vld1q_f32(input); input += 4; const float32x4_t vt2 = vld1q_f32(input); input += 4; const float32x4_t vt3 = vld1q_f32(input); input += 4; vmin0 = vminq_f32(vmin0, vt0); vmax0 = vmaxq_f32(vmax0, vt0); vmin1 = vminq_f32(vmin1, vt1); vmax1 = vmaxq_f32(vmax1, vt1); vmin2 = vminq_f32(vmin2, vt2); vmax2 = vmaxq_f32(vmax2, vt2); vmin3 = vminq_f32(vmin3, vt3); vmax3 = vmaxq_f32(vmax3, vt3); } vmin0 = vminq_f32(vmin0, vmin1); vmax0 = vmaxq_f32(vmax0, vmax1); vmin2 = vminq_f32(vmin2, vmin3); vmax2 = vmaxq_f32(vmax2, vmax3); vmin0 = vminq_f32(vmin0, vmin2); vmax0 = vmaxq_f32(vmax0, vmax2); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vt = vld1q_f32(input); input += 4; vmin0 = vminq_f32(vmin0, vt); vmax0 = vmaxq_f32(vmax0, vt); } float32x2_t vmin = vmin_f32(vget_low_f32(vmin0), vget_high_f32(vmin0)); float32x2_t vmax = vmax_f32(vget_low_f32(vmax0), vget_high_f32(vmax0)); if XNN_UNLIKELY(batch & (2 * sizeof(float))) { const float32x2_t vt = vld1_f32(input); input += 2; vmin = vmin_f32(vmin, vt); vmax = vmax_f32(vmax, vt); } vmin = vpmin_f32(vmin, vmin); vmax = vpmax_f32(vmax, vmax); if XNN_UNLIKELY(batch & (1 * sizeof(float))) { const float32x2_t vt = vld1_dup_f32(input); vmin = vmin_f32(vmin, vt); vmax = vmax_f32(vmax, vt); } vst1_lane_f32(output, vmin, 0); vst1_lane_f32(output + 1, vmax, 0); } void xnn_f32_rsum_ukernel__neon_x16_acc4( size_t batch, const float* input, float* output, const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); float32x4_t vacc0 = vmovq_n_f32(0.0f); float32x4_t vacc1 = vmovq_n_f32(0.0f); float32x4_t vacc2 = vmovq_n_f32(0.0f); float32x4_t vacc3 = vmovq_n_f32(0.0f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const float32x4_t vt0 = vld1q_f32(input); input += 4; const float32x4_t vt1 = vld1q_f32(input); input += 4; const float32x4_t vt2 = vld1q_f32(input); input += 4; const float32x4_t vt3 = vld1q_f32(input); input += 4; vacc0 = vaddq_f32(vacc0, vt0); vacc1 = vaddq_f32(vacc1, vt1); vacc2 = vaddq_f32(vacc2, 
vt2); vacc3 = vaddq_f32(vacc3, vt3); } vacc0 = vaddq_f32(vacc0, vacc1); vacc2 = vaddq_f32(vacc2, vacc3); vacc0 = vaddq_f32(vacc0, vacc2); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vt = vld1q_f32(input); input += 4; vacc0 = vaddq_f32(vacc0, vt); } const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(float))) { const float32x2_t vt = vld1_f32(input); input += 2; vacc = vadd_f32(vacc, vt); } vacc = vpadd_f32(vacc, vacc); if XNN_UNLIKELY(batch & (1 * sizeof(float))) { const float32x2_t vt = vld1_dup_f32(input); vacc = vadd_f32(vacc, vt); } vacc = vmul_f32(vacc, vscale); vst1_lane_f32(output, vacc, 0); } void xnn_f32_spmm_minmax_ukernel_32x1__neon( size_t mc, size_t nc, const float* input, const float* weights, const int32_t* widx_dmap, const uint32_t* nidx_nnzmap, float* output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mc != 0); assert(mc % sizeof(float) == 0); assert(nc != 0); #if XNN_ARCH_ARM64 const float32x4x2_t vminmax = vld2q_dup_f32(&params->scalar.min); const float32x4_t vmin = vminmax.val[0]; const float32x4_t vmax = vminmax.val[1]; #else const float32x2x2_t vminmax = vld2_dup_f32(&params->scalar.min); const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]); const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]); #endif size_t output_decrement = output_stride * nc - 32 * sizeof(float); while XNN_LIKELY(mc >= 32 * sizeof(float)) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; float32x4_t vacc89AB = vacc0123; float32x4_t vaccCDEF = vacc0123; float32x4_t vaccGHIJ = vacc0123; float32x4_t vaccKLMN = vacc0123; float32x4_t vaccOPQR = vacc0123; float32x4_t vaccSTUV = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); const float32x4_t vi89AB = vld1q_f32(input + 8); const float32x4_t viCDEF = vld1q_f32(input + 12); const float32x4_t viGHIJ = vld1q_f32(input + 16); const float32x4_t viKLMN = vld1q_f32(input + 20); const float32x4_t viOPQR = vld1q_f32(input + 24); const float32x4_t viSTUV = vld1q_f32(input + 28); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); xnn_prefetch_to_l1(input + 16); xnn_prefetch_to_l1(input + 32); const float32x4_t vw = vld1q_dup_f32(w); w += 1; xnn_prefetch_to_l1(w + 32); vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); vacc89AB = vmlaq_f32(vacc89AB, vi89AB, vw); vaccCDEF = vmlaq_f32(vaccCDEF, viCDEF, vw); vaccGHIJ = vmlaq_f32(vaccGHIJ, viGHIJ, vw); vaccKLMN = vmlaq_f32(vaccKLMN, viKLMN, vw); vaccOPQR = vmlaq_f32(vaccOPQR, viOPQR, vw); vaccSTUV = vmlaq_f32(vaccSTUV, viSTUV, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax); float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax); float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax); float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB =
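/*
 * Sparse format consumed by this spmm kernel: for each output channel,
 * *nnzmap++ gives its nonzero count, and every nonzero weight is paired
 * with a signed byte delta from widx_dmap that advances the dense input
 * pointer to the next nonzero's row. In sketch form, per 32-float block:
 *
 *   for (nz = *nnzmap++; nz != 0; nz--) {
 *     acc += block_load(input) * (*w++);
 *     input = (const float*) ((uintptr_t) input + *dmap++);
 *   }
 */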
vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); voutGHIJ = vmaxq_f32(voutGHIJ, vmin); voutKLMN = vmaxq_f32(voutKLMN, vmin); voutOPQR = vmaxq_f32(voutOPQR, vmin); voutSTUV = vmaxq_f32(voutSTUV, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); vst1q_f32(output + 16, voutGHIJ); vst1q_f32(output + 20, voutKLMN); vst1q_f32(output + 24, voutOPQR); vst1q_f32(output + 28, voutSTUV); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 32; mc -= 32 * sizeof(float); } if XNN_UNLIKELY(mc != 0) { output_decrement += 16 * sizeof(float); if (mc & (16 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; float32x4_t vacc89AB = vacc0123; float32x4_t vaccCDEF = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); const float32x4_t vi89AB = vld1q_f32(input + 8); const float32x4_t viCDEF = vld1q_f32(input + 12); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); vacc89AB = vmlaq_f32(vacc89AB, vi89AB, vw); vaccCDEF = vmlaq_f32(vaccCDEF, viCDEF, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); float32x4_t vout89AB = vminq_f32(vacc89AB, vmax); float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vout89AB = vmaxq_f32(vout89AB, vmin); voutCDEF = vmaxq_f32(voutCDEF, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); vst1q_f32(output + 8, vout89AB); vst1q_f32(output + 12, voutCDEF); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 16; } output_decrement += 8 * sizeof(float); if (mc & (8 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; float32x4_t vacc4567 = vacc0123; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); const float32x4_t vi4567 = vld1q_f32(input + 4); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); vacc4567 = vmlaq_f32(vacc4567, vi4567, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); float32x4_t vout4567 = vminq_f32(vacc4567, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vout4567 = vmaxq_f32(vout4567, vmin); vst1q_f32(output, vout0123); vst1q_f32(output + 4, vout4567); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 8; } output_decrement += 4 * sizeof(float); if (mc & (4 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x4_t vacc0123 = 
vld1q_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x4_t vi0123 = vld1q_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x4_t vw = vld1q_dup_f32(w); w += 1; vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); } while (--nnz != 0); } float32x4_t vout0123 = vminq_f32(vacc0123, vmax); vout0123 = vmaxq_f32(vout0123, vmin); vst1q_f32(output, vout0123); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 4; } output_decrement += 2 * sizeof(float); if (mc & (2 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc01 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi01 = vld1_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x2_t vw = vld1_dup_f32(w); w += 1; vacc01 = vmla_f32(vacc01, vi01, vw); } while (--nnz != 0); } float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); vout01 = vmax_f32(vout01, vget_low_f32(vmin)); vst1_f32(output, vout01); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 2; } output_decrement += 1 * sizeof(float); if (mc & (1 * sizeof(float))) { const float* w = weights; const int32_t* dmap = widx_dmap; const uint32_t* nnzmap = nidx_nnzmap; size_t n = nc; do { uint32_t nnz = *nnzmap++; float32x2_t vacc0 = vld1_dup_f32(w); w += 1; if XNN_LIKELY(nnz != 0) { do { const intptr_t diff = *dmap++; const float32x2_t vi0 = vld1_dup_f32(input); input = (const float*) ((uintptr_t) input + (uintptr_t) diff); const float32x2_t vw = vld1_dup_f32(w); w += 1; vacc0 = vmla_f32(vacc0, vi0, vw); } while (--nnz != 0); } float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax)); vout0 = vmax_f32(vout0, vget_low_f32(vmin)); vst1_lane_f32(output, vout0, 0); output = (float*) ((uintptr_t) output + output_stride); } while (--n != 0); output = (float*) ((uintptr_t) output - output_decrement); input += 1; } } } void xnn_f32_vadd_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vaddq_f32(va0, vb0); float32x4_t vacc1 = vaddq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc
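/*
 * The element-wise kernels from here on share one tail idiom: a leftover
 * of 1..3 floats is handled with a full 4-wide load (these ukernels are
 * built with XNN_OOB_READS, so the overread is sanctioned) followed by
 * piecewise 2-lane and 1-lane stores, keeping the main path branch-free.
 */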
= vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vaddc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vaddq_f32(vacc0, vb); vacc1 = vaddq_f32(vacc1, vb); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vaddq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vmaxq_f32(va0, vb0); float32x4_t vacc1 = vmaxq_f32(va1, vb1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmaxq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vmaxq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 *
sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmaxc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vmaxq_f32(vacc_, vb); vaccl = vmaxq_f32(vaccl, vb); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vmaxq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vmaxq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmin_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vminq_f32(va0, vb0); float32x4_t vacc1 = vminq_f32(va1, vb1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vminq_f32(va, vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vminq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vminc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vminq_f32(vacc_, vb); vaccl = vminq_f32(vaccl, vb); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vminq_f32(va, 
vb); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vminq_f32(va, vb); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmul_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vmulq_f32(va0, vb0); float32x4_t vacc1 = vmulq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmulc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vmulq_f32(vacc_, vb); vaccl = vmulq_f32(vaccl, vb); vacc_ = vmaxq_f32(vacc_, voutput_min); vaccl = vmaxq_f32(vaccl, voutput_min); vacc_ = vminq_f32(vacc_, voutput_max); vaccl = vminq_f32(vaccl, voutput_max); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = 
vld1q_f32(input_a); float32x4_t vacc = vmulq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vrsubc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(¶ms->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(¶ms->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc_ = vld1q_f32(input_a); input_a += 4; float32x4_t vaccl = vld1q_f32(input_a); input_a += 4; vacc_ = vsubq_f32(vb, vacc_); vaccl = vsubq_f32(vb, vaccl); vacc_ = vmaxq_f32(vacc_, voutput_min); vaccl = vmaxq_f32(vaccl, voutput_min); vacc_ = vminq_f32(vacc_, voutput_max); vaccl = vminq_f32(vaccl, voutput_max); vst1q_f32(output, vacc_); output += 4; vst1q_f32(output, vaccl); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(vb, va); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(vb, va); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsqrdiff_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vsubq_f32(va0, vb0); float32x4_t vacc1 = vsubq_f32(va1, vb1); vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { 
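// Note on the remainder pattern shared by the kernels in this file: the tail path loads a full 4-lane vector even when fewer than four floats remain; XNN_OOB_READS marks that read-only overrun of the input buffers as intentional, and only the valid lanes are written out via vst1_f32/vst1_lane_f32.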
vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsqrdiffc_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vsubq_f32(vacc0, vb); vacc1 = vsubq_f32(vacc1, vb); vacc0 = vmulq_f32(vacc0, vacc0); vacc1 = vmulq_f32(vacc1, vacc1); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmulq_f32(vacc, vacc); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsub_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb0 = vld1q_f32(input_b); input_b += 4; const float32x4_t va1 = vld1q_f32(input_a); input_a += 4; const float32x4_t vb1 = vld1q_f32(input_b); input_b += 4; float32x4_t vacc0 = vsubq_f32(va0, vb0); float32x4_t vacc1 = vsubq_f32(va1, vb1); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; const float32x4_t vb = vld1q_f32(input_b); input_b += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); const float32x4_t vb = vld1q_f32(input_b); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vsubc_minmax_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); 
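// Like the other *c (scalar-broadcast) kernels in this file, vsubc reads a single float from input_b and splats it across a vector once, ahead of the loops, via vld1q_dup_f32.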
assert(input_b != NULL); assert(output != NULL); const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); const float32x4_t vb = vld1q_dup_f32(input_b); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0 = vld1q_f32(input_a); input_a += 4; float32x4_t vacc1 = vld1q_f32(input_a); input_a += 4; vacc0 = vsubq_f32(vacc0, vb); vacc1 = vsubq_f32(vacc1, vb); vacc0 = vmaxq_f32(vacc0, voutput_min); vacc1 = vmaxq_f32(vacc1, voutput_min); vacc0 = vminq_f32(vacc0, voutput_max); vacc1 = vminq_f32(vacc1, voutput_max); vst1q_f32(output, vacc0); output += 4; vst1q_f32(output, vacc1); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t va = vld1q_f32(input_a); input_a += 4; float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t va = vld1q_f32(input_a); float32x4_t vacc = vsubq_f32(va, vb); vacc = vmaxq_f32(vacc, voutput_min); vacc = vminq_f32(vacc, voutput_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vclamp_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vy_min = vld1q_dup_f32(&params->scalar.min); const float32x4_t vy_max = vld1q_dup_f32(&params->scalar.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vacc0123 = vld1q_f32(input); input += 4; float32x4_t vacc4567 = vld1q_f32(input); input += 4; vacc0123 = vmaxq_f32(vacc0123, vy_min); vacc4567 = vmaxq_f32(vacc4567, vy_min); vacc0123 = vminq_f32(vacc0123, vy_max); vacc4567 = vminq_f32(vacc4567, vy_max); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vacc = vld1q_f32(input); input += 4; vacc = vmaxq_f32(vacc, vy_min); vacc = vminq_f32(vacc, vy_max); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vacc = vld1q_f32(input); vacc = vmaxq_f32(vacc, vy_min); vacc = vminq_f32(vacc, vy_max); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vcmul_ukernel__neon_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float* ar = input_a; const float* ai = (const float*) ((uintptr_t) input_a + batch); const float* br = input_b; const float* bi = (const float*) ((uintptr_t) input_b + batch); float* or = output; float* oi = (float*) ((uintptr_t) output + batch); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t va0r = vld1q_f32(ar); ar += 4; const float32x4_t va0i = vld1q_f32(ai); ai += 4; const float32x4_t vb0r = vld1q_f32(br); br += 4; const float32x4_t vb0i 
= vld1q_f32(bi); bi += 4; const float32x4_t va1r = vld1q_f32(ar); ar += 4; const float32x4_t va1i = vld1q_f32(ai); ai += 4; const float32x4_t vb1r = vld1q_f32(br); br += 4; const float32x4_t vb1i = vld1q_f32(bi); bi += 4; float32x4_t vacc0r = vmulq_f32(va0r, vb0r); float32x4_t vacc0i = vmulq_f32(va0r, vb0i); float32x4_t vacc1r = vmulq_f32(va1r, vb1r); float32x4_t vacc1i = vmulq_f32(va1r, vb1i); vacc0r = vmlsq_f32(vacc0r, va0i, vb0i); vacc0i = vmlaq_f32(vacc0i, va0i, vb0r); vacc1r = vmlsq_f32(vacc1r, va1i, vb1i); vacc1i = vmlaq_f32(vacc1i, va1i, vb1r); vst1q_f32(or, vacc0r); or += 4; vst1q_f32(oi, vacc0i); oi += 4; vst1q_f32(or, vacc1r); or += 4; vst1q_f32(oi, vacc1i); oi += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t var = vld1q_f32(ar); ar += 4; const float32x4_t vai = vld1q_f32(ai); ai += 4; const float32x4_t vbr = vld1q_f32(br); br += 4; const float32x4_t vbi = vld1q_f32(bi); bi += 4; float32x4_t vaccr = vmulq_f32(var, vbr); float32x4_t vacci = vmulq_f32(var, vbi); vaccr = vmlsq_f32(vaccr, vai, vbi); vacci = vmlaq_f32(vacci, vai, vbr); vst1q_f32(or, vaccr); or += 4; vst1q_f32(oi, vacci); oi += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t var = vld1q_f32(ar); ar += 4; const float32x4_t vai = vld1q_f32(ai); ai += 4; const float32x4_t vbr = vld1q_f32(br); br += 4; const float32x4_t vbi = vld1q_f32(bi); bi += 4; float32x4_t vaccr = vmulq_f32(var, vbr); float32x4_t vacci = vmulq_f32(var, vbi); vaccr = vmlsq_f32(vaccr, vai, vbi); vacci = vmlaq_f32(vacci, vai, vbr); float32x2_t vaccr_lo = vget_low_f32(vaccr); float32x2_t vacci_lo = vget_low_f32(vacci); if (batch & (2 * sizeof(float))) { vst1_f32(or, vaccr_lo); or += 2; vst1_f32(oi, vacci_lo); oi += 2; vaccr_lo = vget_high_f32(vaccr); vacci_lo = vget_high_f32(vacci); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(or, vaccr_lo, 0); vst1_lane_f32(oi, vacci_lo, 0); } } } extern XNN_INTERNAL const int32_t xnn_table_exp2minus_k_over_16[16]; void xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vprescale = vld1q_dup_f32(&params->neon_rr2_lut16_p3.prescale); const float32x4_t valpha = vld1q_dup_f32(&params->neon_rr2_lut16_p3.alpha); const float32x4_t vbeta = vld1q_dup_f32(&params->neon_rr2_lut16_p3.beta); const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neon_rr2_lut16_p3.sat_cutoff); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut16_p3.magic_bias); const float32x4_t vlog2e = vld1q_dup_f32(&params->neon_rr2_lut16_p3.log2e); const int32x4_t vindex_mask = vmovq_n_s32(0xF); const float32x4_t vminus_ln2_hi = vld1q_dup_f32(&params->neon_rr2_lut16_p3.minus_ln2_hi); const float32x4_t vminus_ln2_lo = vld1q_dup_f32(&params->neon_rr2_lut16_p3.minus_ln2_lo); const float32x4_t vc3 = vld1q_dup_f32(&params->neon_rr2_lut16_p3.c3); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut16_p3.c2); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vmaxq_f32(vmulq_f32(vx0123, vprescale), vsat_cutoff); const float32x4_t vz4567 = vmaxq_f32(vmulq_f32(vx4567, vprescale), vsat_cutoff); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vlog2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, 
vlog2e); const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 2)); const int32x4_t ven0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 19); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 2)); const int32x4_t ven4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 19); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); int32x2_t vl01 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01)); int32x2_t vl23 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23)); vl01 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)), vl01, 1); vl23 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)), vl23, 1); const int32x4_t vl0123 = vcombine_s32(vl01, vl23); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); int32x2_t vl45 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45)); int32x2_t vl67 = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67)); vl45 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)), vl45, 1); vl67 = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)), vl67, 1); const int32x4_t vl4567 = vcombine_s32(vl45, vl67); vn0123 = vsubq_f32(vn0123, vmagic_bias); float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vl0123, ven0123)); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vl4567, ven4567)); float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vminus_ln2_hi); float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vminus_ln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vminus_ln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vminus_ln2_lo); float32x4_t vp0123 = vmlaq_f32(vc2, vc3, vt0123); float32x4_t vp4567 = vmlaq_f32(vc2, vc3, vt4567); vp0123 = vmulq_f32(vp0123, vt0123); vp4567 = vmulq_f32(vp4567, vt4567); vt0123 = vmulq_f32(vt0123, vs0123); vs0123 = vsubq_f32(vs0123, vone); vt4567 = vmulq_f32(vt4567, vs4567); vs4567 = vsubq_f32(vs4567, vone); vp0123 = vmlaq_f32(vt0123, vp0123, vt0123); vp4567 = vmlaq_f32(vt4567, vp4567, vt4567); const float32x4_t ve0123 = vmulq_f32(vaddq_f32(vp0123, vs0123), valpha); const float32x4_t ve4567 = vmulq_f32(vaddq_f32(vp4567, vs4567), valpha); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); vx0123 = vmulq_f32(vx0123, vbeta); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vx4567 = vmulq_f32(vx4567, vbeta); const float32x4_t vy0123 = vbslq_f32(vm0123, ve0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vm4567, ve4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = 
vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vmlaq_f32(vz, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vmlaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vmlaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vmaxq_f32(vmulq_f32(vx, vprescale), vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vlog2e); const uint64x2_t vidx = vreinterpretq_u64_s32(vshlq_n_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 2)); const int32x4_t ven = vshlq_n_s32(vreinterpretq_s32_f32(vn), 19); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); int32x2_t vl_lo = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)); int32x2_t vl_hi = vld1_dup_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)); vl_lo = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)), vl_lo, 1); vl_hi = vld1_lane_s32((const int32_t*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)), vl_hi, 1); vn = vsubq_f32(vn, vmagic_bias); const int32x4_t vl = vcombine_s32(vl_lo, vl_hi); float32x4_t vt = vmlaq_f32(vz, vn, vminus_ln2_hi); vt = vmlaq_f32(vt, vn, vminus_ln2_lo); float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vl, ven)); float32x4_t vp = vmlaq_f32(vc2, vc3, vt); vp = vmulq_f32(vp, vt); vt = vmulq_f32(vt, vs); vs = vsubq_f32(vs, vone); vp = vmlaq_f32(vt, vp, vt); const float32x4_t ve = vmulq_f32(vaddq_f32(vp, vs), valpha); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vx = vmulq_f32(vx, vbeta); const float32x4_t vy = vbslq_f32(vm, ve, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vhswish_ukernel__neon_x16( size_t batch, const float* input, float* output, const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth); const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three); const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six)); const int32x4_t vzero = vdupq_n_s32(0); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { float32x4_t vx0123 = vld1q_f32(input); input += 4; float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vx89AB = 
vld1q_f32(input); input += 4; float32x4_t vxCDEF = vld1q_f32(input); input += 4; float32x4_t vacc0123 = vaddq_f32(vx0123, vthree); vx0123 = vmulq_f32(vx0123, vsixth); float32x4_t vacc4567 = vaddq_f32(vx4567, vthree); vx4567 = vmulq_f32(vx4567, vsixth); float32x4_t vacc89AB = vaddq_f32(vx89AB, vthree); vx89AB = vmulq_f32(vx89AB, vsixth); float32x4_t vaccCDEF = vaddq_f32(vxCDEF, vthree); vxCDEF = vmulq_f32(vxCDEF, vsixth); vacc0123 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc0123), vzero)); vacc4567 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc4567), vzero)); vacc89AB = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc89AB), vzero)); vaccCDEF = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vaccCDEF), vzero)); vacc0123 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc0123), vsix)); vacc4567 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc4567), vsix)); vacc89AB = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc89AB), vsix)); vaccCDEF = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vaccCDEF), vsix)); vacc0123 = vmulq_f32(vacc0123, vx0123); vacc4567 = vmulq_f32(vacc4567, vx4567); vacc89AB = vmulq_f32(vacc89AB, vx89AB); vaccCDEF = vmulq_f32(vaccCDEF, vxCDEF); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; vst1q_f32(output, vacc89AB); output += 4; vst1q_f32(output, vaccCDEF); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vacc = vaddq_f32(vx, vthree); vx = vmulq_f32(vx, vsixth); vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero)); vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix)); vacc = vmulq_f32(vacc, vx); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { float32x4_t vx = vld1q_f32(input); float32x4_t vacc = vaddq_f32(vx, vthree); vx = vmulq_f32(vx, vsixth); vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero)); vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix)); vacc = vmulq_f32(vacc, vx); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vlrelu_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vslope = vld1q_dup_f32(&params->scalar.slope); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vacc0123 = vmulq_f32(vx0123, vslope); const uint32x4_t vmask0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0)); float32x4_t vacc4567 = vmulq_f32(vx4567, vslope); const uint32x4_t vmask4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0)); vacc0123 = vbslq_f32(vmask0123, vacc0123, vx0123); vacc4567 = vbslq_f32(vmask4567, vacc4567, vx4567); vst1q_f32(output, vacc0123); output += 4; vst1q_f32(output, vacc4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vacc = vmulq_f32(vx, vslope); const uint32x4_t vmask = 
vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0)); vacc = vbslq_f32(vmask, vacc, vx); vst1q_f32(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); float32x4_t vacc = vmulq_f32(vx, vslope); const uint32x4_t vmask = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0)); vacc = vbslq_f32(vmask, vacc, vx); float32x2_t vacc_lo = vget_low_f32(vacc); if (batch & (2 * sizeof(float))) { vst1_f32(output, vacc_lo); output += 2; vacc_lo = vget_high_f32(vacc); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vacc_lo, 0); } } } void xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const float32x4_t vscale0123 = vld1q_f32(w); w += 4; float32x4_t vacc0x0123 = vld1q_f32(i0); i0 += 4; float32x4_t vacc1x0123 = vld1q_f32(i1); i1 += 4; vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); const float32x4_t vbias0123 = vld1q_f32(w); w += 4; vacc0x0123 = vaddq_f32(vacc0x0123, vbias0123); vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); vst1q_f32(o0, vacc0x0123); o0 += 4; vst1q_f32(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const float32x4_t vscale0123 = vld1q_f32(w); float32x4_t vacc0x0123 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + c); float32x4_t vacc1x0123 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + c); vacc0x0123 = vmulq_f32(vacc0x0123, vscale0123); vacc1x0123 = vmulq_f32(vacc1x0123, vscale0123); const float32x4_t vbias0123 = vld1q_f32(w + 4); vacc0x0123 = vaddq_f32(vacc0x0123, vbias0123); vacc1x0123 = vaddq_f32(vacc1x0123, vbias0123); vacc0x0123 = vmaxq_f32(vacc0x0123, vmin); vacc1x0123 = vmaxq_f32(vacc1x0123, vmin); vacc0x0123 = vminq_f32(vacc0x0123, vmax); vacc1x0123 = vminq_f32(vacc1x0123, vmax); float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); if (c & (2 * sizeof(float))) { vst1_f32(o0, vacc0x01); o0 += 2; vst1_f32(o1, vacc1x01); o1 += 2; vacc0x01 = vget_high_f32(vacc0x0123); vacc1x01 = vget_high_f32(vacc1x0123); } if (c & (1 * sizeof(float))) { vst1_lane_f32(o0, vacc0x01, 0); o0 += 1; vst1_lane_f32(o1, vacc1x01, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_vrndd_ukernel__neon_x8( size_t batch, const float* input, float* output, const union 
xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); const uint32x4_t vone = vreinterpretq_u32_f32(vmovq_n_f32(1.0f)); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vprerndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vprerndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx0123 = vbslq_f32(vrndmask0123, vprerndx0123, vx0123); const float32x4_t vrndx4567 = vbslq_f32(vrndmask4567, vprerndx4567, vx4567); const uint32x4_t vadjmask0123 = vcgtq_f32(vrndx0123, vx0123); const uint32x4_t vadjmask4567 = vcgtq_f32(vrndx4567, vx4567); const float32x4_t vadjrndx0123 = vreinterpretq_f32_u32(vandq_u32(vadjmask0123, vone)); const float32x4_t vadjrndx4567 = vreinterpretq_f32_u32(vandq_u32(vadjmask4567, vone)); const float32x4_t vy0123 = vsubq_f32(vrndx0123, vadjrndx0123); const float32x4_t vy4567 = vsubq_f32(vrndx4567, vadjrndx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); const uint32x4_t vadjmask = vcgtq_f32(vrndx, vx); const float32x4_t vadjrndx = vreinterpretq_f32_u32(vandq_u32(vadjmask, vone)); const float32x4_t vy = vsubq_f32(vrndx, vadjrndx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); const uint32x4_t vadjmask = vcgtq_f32(vrndx, vx); const float32x4_t vadjrndx = vreinterpretq_f32_u32(vandq_u32(vadjmask, vone)); const float32x4_t vy = vsubq_f32(vrndx, vadjrndx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndne_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vmagic_number = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t 
vx4567 = vld1q_f32(input); input += 4; const float32x4_t vabsx0123 = vabsq_f32(vx0123); uint32x4_t vrndmask0123 = vcaltq_f32(vmagic_number, vx0123); const float32x4_t vabsx4567 = vabsq_f32(vx4567); uint32x4_t vrndmask4567 = vcaltq_f32(vmagic_number, vx4567); float32x4_t vrndabsx0123 = vaddq_f32(vabsx0123, vmagic_number); float32x4_t vrndabsx4567 = vaddq_f32(vabsx4567, vmagic_number); vrndmask0123 = vorrq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vorrq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx0123 = vsubq_f32(vrndabsx0123, vmagic_number); vrndabsx4567 = vsubq_f32(vrndabsx4567, vmagic_number); const float32x4_t vy0123 = vbslq_f32(vrndmask0123, vx0123, vrndabsx0123); const float32x4_t vy4567 = vbslq_f32(vrndmask4567, vx4567, vrndabsx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vabsx = vabsq_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vmagic_number, vx); float32x4_t vrndabsx = vaddq_f32(vabsx, vmagic_number); vrndmask = vorrq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx = vsubq_f32(vrndabsx, vmagic_number); const float32x4_t vy = vbslq_f32(vrndmask, vx, vrndabsx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vabsx = vabsq_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vmagic_number, vx); float32x4_t vrndabsx = vaddq_f32(vabsx, vmagic_number); vrndmask = vorrq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); vrndabsx = vsubq_f32(vrndabsx, vmagic_number); const float32x4_t vy = vbslq_f32(vrndmask, vx, vrndabsx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndu_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); const float32x4_t vone = vmovq_n_f32(1.0f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vprerndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vprerndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx0123 = vbslq_f32(vrndmask0123, vprerndx0123, vx0123); const float32x4_t vrndx4567 = vbslq_f32(vrndmask4567, vprerndx4567, vx4567); uint32x4_t vadjmask0123 = vcgeq_f32(vrndx0123, vx0123); uint32x4_t vadjmask4567 = vcgeq_f32(vrndx4567, vx4567); const float32x4_t vadjrndx0123 = vaddq_f32(vrndx0123, vone); const float32x4_t vadjrndx4567 = vaddq_f32(vrndx4567, vone); vadjmask0123 = vorrq_u32(vadjmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vadjmask4567 = 
vorrq_u32(vadjmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy0123 = vbslq_f32(vadjmask0123, vrndx0123, vadjrndx0123); const float32x4_t vy4567 = vbslq_f32(vadjmask4567, vrndx4567, vadjrndx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); uint32x4_t vadjmask = vcgeq_f32(vrndx, vx); const float32x4_t vadjrndx = vaddq_f32(vrndx, vone); vadjmask = vorrq_u32(vadjmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vadjmask, vrndx, vadjrndx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = vcvtq_s32_f32(vx); const float32x4_t vprerndx = vcvtq_f32_s32(vintx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vrndx = vbslq_f32(vrndmask, vprerndx, vx); uint32x4_t vadjmask = vcgeq_f32(vrndx, vx); const float32x4_t vadjrndx = vaddq_f32(vrndx, vone); vadjmask = vorrq_u32(vadjmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vadjmask, vrndx, vadjrndx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vrndz_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vintegral_threshold = vreinterpretq_f32_u32(vmovq_n_u32(UINT32_C(0x4B000000))); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const int32x4_t vintx0123 = vcvtq_s32_f32(vx0123); const int32x4_t vintx4567 = vcvtq_s32_f32(vx4567); uint32x4_t vrndmask0123 = vcaltq_f32(vx0123, vintegral_threshold); uint32x4_t vrndmask4567 = vcaltq_f32(vx4567, vintegral_threshold); const float32x4_t vrndx0123 = vcvtq_f32_s32(vintx0123); const float32x4_t vrndx4567 = vcvtq_f32_s32(vintx4567); vrndmask0123 = vbicq_u32(vrndmask0123, vmovq_n_u32(UINT32_C(0x80000000))); vrndmask4567 = vbicq_u32(vrndmask4567, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy0123 = vbslq_f32(vrndmask0123, vrndx0123, vx0123); const float32x4_t vy4567 = vbslq_f32(vrndmask4567, vrndx4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const int32x4_t vintx = vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vrndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vrndmask, vrndx, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const int32x4_t vintx = 
vcvtq_s32_f32(vx); uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold); const float32x4_t vrndx = vcvtq_f32_s32(vintx); vrndmask = vbicq_u32(vrndmask, vmovq_n_u32(UINT32_C(0x80000000))); const float32x4_t vy = vbslq_f32(vrndmask, vrndx, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64]; void xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8( size_t batch, const float* input, float* output, const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_rr2_lut64_p2.magic_bias); const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neon_rr2_lut64_p2.minus_log2e); const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x3F)); const float32x4_t vln2_hi = vld1q_dup_f32(&params->neon_rr2_lut64_p2.ln2_hi); const float32x4_t vln2_lo = vld1q_dup_f32(&params->neon_rr2_lut64_p2.ln2_lo); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_rr2_lut64_p2.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vdenorm_cutoff = vld1q_dup_f32(&params->neon_rr2_lut64_p2.denorm_cutoff); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vz0123 = vabsq_f32(vx0123); const float32x4_t vz4567 = vabsq_f32(vx4567); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e); const int32x4_t ve0123 = vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 17); const int32x4_t ve4567 = vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 17); // Use the low 6 bits of n, as an integer, as an index for table lookup of l := 2**frac(n). 
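// How the lookup works (a sketch of the reconstruction, derived from the surrounding code): the magic-bias addition above rounded n := -z * log2(e) to a multiple of 1/64 and parked that fixed-point value in the low mantissa bits of vn. The low 6 bits index the 64-entry table xnn_table_exp2minus_k_over_64 (values of 2**(-k/64)), while the bits above them, moved into the exponent field by the shift-by-17 (23 mantissa bits - 6 fractional bits), supply the integer part of the power of two. Adding ve to a table entry's bit pattern therefore reconstructs s ~= 2**n without evaluating exp, and the degree-2 polynomial below only has to approximate e**(-t) on the small residual t = z + n*ln(2), with |t| roughly bounded by ln(2)/128.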
const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask)); const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask)); const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0); const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1); float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx01]); float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx23]); const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0); const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1); float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx45]); float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx67]); vl01 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx01 >> 32)], vl01, 1); vl23 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx23 >> 32)], vl23, 1); const float32x4_t vl0123 = vcombine_f32(vl01, vl23); vl45 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx45 >> 32)], vl45, 1); vl67 = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx67 >> 32)], vl67, 1); const float32x4_t vl4567 = vcombine_f32(vl45, vl67); const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi); float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi); vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo); vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo); float32x4_t vp0123 = vmulq_f32(vt0123, vc2); float32x4_t vp4567 = vmulq_f32(vt4567, vc2); vp0123 = vmlsq_f32(vt0123, vp0123, vt0123); vp4567 = vmlsq_f32(vt4567, vp4567, vt4567); const float32x4_t vy0123 = vmlsq_f32(vs0123, vs0123, vp0123); const float32x4_t vy4567 = vmlsq_f32(vs4567, vs4567, vp4567); const float32x4_t vd0123 = vaddq_f32(vy0123, vone); const float32x4_t vd4567 = vaddq_f32(vy4567, vone); float32x4_t vr0123 = vrecpeq_f32(vd0123); float32x4_t vr4567 = vrecpeq_f32(vd4567); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123)); vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567)); float32x4_t vf0123 = vmulq_f32(vy0123, vr0123); float32x4_t vf4567 = vmulq_f32(vy4567, vr4567); vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff))); vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff))); const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f)); const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f)); vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123)); vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567)); vst1q_f32(output, vf0123); output += 4; vst1q_f32(output, vf4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 
1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi); vt = vmlaq_f32(vt, vn, vln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlsq_f32(vt, vp, vt); const float32x4_t vy = vmlsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); vst1q_f32(output, vf); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vz = vabsq_f32(vx); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const int32x4_t ve = vshlq_n_s32(vreinterpretq_s32_f32(vn), 17); const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask)); const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0); const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1); float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_lo]); float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) vidx_hi]); vl_lo = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_lo >> 32)], vl_lo, 1); vl_hi = vld1_lane_f32(&xnn_table_exp2minus_k_over_64[(uint32_t) (vidx_hi >> 32)], vl_hi, 1); const float32x4_t vl = vcombine_f32(vl_lo, vl_hi); const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve)); vn = vsubq_f32(vn, vmagic_bias); float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi); vt = vmlaq_f32(vt, vn, vln2_lo); float32x4_t vp = vmulq_f32(vt, vc2); vp = vmlsq_f32(vt, vp, vt); const float32x4_t vy = vmlsq_f32(vs, vs, vp); const float32x4_t vd = vaddq_f32(vy, vone); float32x4_t vr = vrecpeq_f32(vd); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); vr = vmulq_f32(vr, vrecpsq_f32(vr, vd)); float32x4_t vf = vmulq_f32(vy, vr); vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff))); const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f)); vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf)); float32x2_t vf_lo = vget_low_f32(vf); if (batch & (2 * sizeof(float))) { vst1_f32(output, vf_lo); output += 2; vf_lo = vget_high_f32(vf); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vf_lo, 0); } } } void xnn_f32_vtanh_ukernel__neon_expm1minus_rr1_p6h5ts_nr2recps_x8( size_t batch, const float* input, float* output, const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const float32x4_t vsat_cutoff = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.sat_cutoff); const float32x4_t vminus_log2e = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.minus_log2e); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.magic_bias); const float32x4_t vln2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.ln2); const float32x4_t vc6 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c6); const float32x4_t 
vc5 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c5); const float32x4_t vc4 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c4); const float32x4_t vc3 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c3); const float32x4_t vc2 = vld1q_dup_f32(&params->neon_expm1minus_rr1_p6h5.c2); const float32x4_t vone = vmovq_n_f32(1.0f); const float32x4_t vtwo = vmovq_n_f32(2.0f); const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; float32x4_t vz0123 = vabsq_f32(vx0123); float32x4_t vz4567 = vabsq_f32(vx4567); vz0123 = vminq_f32(vz0123, vsat_cutoff); vz4567 = vminq_f32(vz4567, vsat_cutoff); float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e); float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e); const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23)); const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23)); vn0123 = vsubq_f32(vn0123, vmagic_bias); vn4567 = vsubq_f32(vn4567, vmagic_bias); const float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2); const float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2); float32x4_t vp0123 = vmlaq_f32(vc5, vc6, vt0123); float32x4_t vp4567 = vmlaq_f32(vc5, vc6, vt4567); vp0123 = vmlaq_f32(vc4, vp0123, vt0123); vp0123 = vmlaq_f32(vc3, vp0123, vt0123); vp0123 = vmlaq_f32(vc2, vp0123, vt0123); vp4567 = vmlaq_f32(vc4, vp4567, vt4567); vp4567 = vmlaq_f32(vc3, vp4567, vt4567); vp4567 = vmlaq_f32(vc2, vp4567, vt4567); vp0123 = vmlsq_f32(vtwo, vp0123, vt0123); vp4567 = vmlsq_f32(vtwo, vp4567, vt4567); const float32x4_t vts0123 = vmulq_f32(vt0123, vs0123); const float32x4_t vsmo0123 = vsubq_f32(vs0123, vone); const float32x4_t vts4567 = vmulq_f32(vt4567, vs4567); const float32x4_t vsmo4567 = vsubq_f32(vs4567, vone); const float32x4_t vemo0123 = vmlsq_f32(vsmo0123, vp0123, vts0123); const float32x4_t vemo4567 = vmlsq_f32(vsmo4567, vp4567, vts4567); const float32x4_t vepo0123 = vaddq_f32(vemo0123, vtwo); const float32x4_t vepo4567 = vaddq_f32(vemo4567, vtwo); float32x4_t vrepo0123 = vrecpeq_f32(vepo0123); float32x4_t vrepo4567 = vrecpeq_f32(vepo4567); float32x4_t verepo0123 = vrecpsq_f32(vrepo0123, vepo0123); float32x4_t verepo4567 = vrecpsq_f32(vrepo4567, vepo4567); vrepo0123 = vmulq_f32(vrepo0123, verepo0123); vrepo4567 = vmulq_f32(vrepo4567, verepo4567); verepo0123 = vrecpsq_f32(vrepo0123, vepo0123); verepo4567 = vrecpsq_f32(vrepo4567, vepo4567); vrepo0123 = vmulq_f32(vrepo0123, verepo0123); vrepo4567 = vmulq_f32(vrepo4567, verepo4567); float32x4_t vy0123 = vmulq_f32(vemo0123, vrepo0123); float32x4_t vy4567 = vmulq_f32(vemo4567, vrepo4567); vy0123 = vbslq_f32(vsign_mask, vx0123, vy0123); vy4567 = vbslq_f32(vsign_mask, vx4567, vy4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vmlaq_f32(vz, vn, vln2); float32x4_t vp = vmlaq_f32(vc5, vc6, vt); vp = vmlaq_f32(vc4, vp, vt); vp = vmlaq_f32(vc3, vp, vt); vp = vmlaq_f32(vc2, vp, vt); vp = vmlsq_f32(vtwo, vp, vt); const 
float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vmlsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); float32x4_t vz = vabsq_f32(vx); vz = vminq_f32(vz, vsat_cutoff); float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e); const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23)); vn = vsubq_f32(vn, vmagic_bias); const float32x4_t vt = vmlaq_f32(vz, vn, vln2); float32x4_t vp = vmlaq_f32(vc5, vc6, vt); vp = vmlaq_f32(vc4, vp, vt); vp = vmlaq_f32(vc3, vp, vt); vp = vmlaq_f32(vc2, vp, vt); vp = vmlsq_f32(vtwo, vp, vt); const float32x4_t vts = vmulq_f32(vt, vs); const float32x4_t vsmo = vsubq_f32(vs, vone); const float32x4_t vemo = vmlsq_f32(vsmo, vp, vts); const float32x4_t vepo = vaddq_f32(vemo, vtwo); float32x4_t vrepo = vrecpeq_f32(vepo); float32x4_t verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); verepo = vrecpsq_f32(vrepo, vepo); vrepo = vmulq_f32(vrepo, verepo); float32x4_t vy = vmulq_f32(vemo, vrepo); vy = vbslq_f32(vsign_mask, vx, vy); float32x2_t vy_low = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_low); output += 2; vy_low = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_low, 0); } } } void xnn_f32_vabs_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vabsq_f32(vx0123); const float32x4_t vy4567 = vabsq_f32(vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vy = vabsq_f32(vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vabsq_f32(vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vneg_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vnegq_f32(vx0123); const float32x4_t vy4567 = vnegq_f32(vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = 
vld1q_f32(input); input += 4; const float32x4_t vy = vnegq_f32(vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vnegq_f32(vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_f32_vsqr_ukernel__neon_x8( size_t batch, const float* input, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const float32x4_t vx0123 = vld1q_f32(input); input += 4; const float32x4_t vx4567 = vld1q_f32(input); input += 4; const float32x4_t vy0123 = vmulq_f32(vx0123, vx0123); const float32x4_t vy4567 = vmulq_f32(vx4567, vx4567); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const float32x4_t vx = vld1q_f32(input); input += 4; const float32x4_t vy = vmulq_f32(vx, vx); vst1q_f32(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const float32x4_t vx = vld1q_f32(input); const float32x4_t vy = vmulq_f32(vx, vx); float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(float))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(float))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; do { const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksum89AB = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vksumCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int32x4_t vzp0 = vld1q_dup_s32(&quantization_params[0].zero_point); int32x4_t vacc0x0123 = vmulq_s32(vksum0123, vzp0); int32x4_t vacc0x4567 = vmulq_s32(vksum4567, vzp0); int32x4_t vacc0x89AB = vmulq_s32(vksum89AB, vzp0); int32x4_t vacc0xCDEF = vmulq_s32(vksumCDEF, vzp0); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const 
int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t 
vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(int8_t)) { const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); } } } } } } } float32x4_t vout0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vout0x89AB = vcvtq_f32_s32(vacc0x89AB); float32x4_t vout0xCDEF = vcvtq_f32_s32(vacc0xCDEF); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscale4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscale89AB = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbscaleCDEF = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbias89AB = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vbiasCDEF = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vinput_scale0 = vld1q_dup_f32(&quantization_params[0].inv_scale); const float32x4_t vscale0x0123 = vmulq_f32(vbscale0123, vinput_scale0); const float32x4_t vscale0x4567 = vmulq_f32(vbscale4567, vinput_scale0); const float32x4_t vscale0x89AB = 
vmulq_f32(vbscale89AB, vinput_scale0); const float32x4_t vscale0xCDEF = vmulq_f32(vbscaleCDEF, vinput_scale0);
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123); vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567); vout0x89AB = vfmaq_f32(vbias89AB, vout0x89AB, vscale0x89AB); vout0xCDEF = vfmaq_f32(vbiasCDEF, vout0xCDEF, vscale0xCDEF);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123); vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567); vout0x89AB = vmlaq_f32(vbias89AB, vout0x89AB, vscale0x89AB); vout0xCDEF = vmlaq_f32(vbiasCDEF, vout0xCDEF, vscale0xCDEF);
#endif
const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min); const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vmaxq_f32(vout0x0123, vmin); vout0x4567 = vmaxq_f32(vout0x4567, vmin); vout0x89AB = vmaxq_f32(vout0x89AB, vmin); vout0xCDEF = vmaxq_f32(vout0xCDEF, vmin); vout0x0123 = vminq_f32(vout0x0123, vmax); vout0x4567 = vminq_f32(vout0x4567, vmax); vout0x89AB = vminq_f32(vout0x89AB, vmax); vout0xCDEF = vminq_f32(vout0xCDEF, vmax); if XNN_LIKELY(nc >= 16) { vst1q_f32(&c0[0], vout0x0123); vst1q_f32(&c0[4], vout0x4567); vst1q_f32(&c0[8], vout0x89AB); vst1q_f32(&c0[12], vout0xCDEF); a0 = (const int8_t*) ((uintptr_t) a0 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); nc -= 16; } else { if (nc & 8) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x89AB; vst1q_f32(c0, vout0x4567); c0 += 4; vout0x4567 = vout0xCDEF; } if (nc & 4) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c0, vout0x01); c0 += 2; vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { const int32x4_t vizp0 = vld1q_dup_s32(&quantization_params[0].zero_point); const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x0123 = vmulq_s32(vksum0123, vizp0); const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vmulq_s32(vksum4567, vizp0); size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0,
vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } float32x4_t vout0x0123 = 
vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vinput_scale0 = vld1q_dup_f32(&quantization_params[0].inv_scale); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x0123 = vmulq_f32(vinput_scale0, vbscale0123); const float32x4_t vbscale4567 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x4567 = vmulq_f32(vinput_scale0, vbscale4567); const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123);
#endif
const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567);
#else
vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567);
#endif
const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); vout0x0123 = vmaxq_f32(vout0x0123, voutput_min); vout0x4567 = vmaxq_f32(vout0x4567, voutput_min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vminq_f32(vout0x0123, voutput_max); vout0x4567 = vminq_f32(vout0x4567, voutput_max); if XNN_LIKELY(nc >= 8) { vst1q_f32(c0, vout0x0123); vst1q_f32(c0 + 4, vout0x4567); a0 = (const int8_t*) ((uintptr_t) a0 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); nc -= 8; } else { if (nc & 4) { vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c0, vout0x01); c0 += 2; vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { a1 = a0; c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { const int32x4_t vizp01 = vld1q_s32(&quantization_params[0].zero_point); const int32x4_t vksum0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x0123 = vmulq_lane_s32(vksum0123, vget_low_s32(vizp01), 0); int32x4_t vacc1x0123 = vmulq_lane_s32(vksum0123, vget_high_s32(vizp01), 0); const int32x4_t vksum4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vmulq_lane_s32(vksum4567, vget_low_s32(vizp01), 0); int32x4_t vacc1x4567 = vmulq_lane_s32(vksum4567, vget_high_s32(vizp01), 0); size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t
vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t 
vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); } float32x4_t vout0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vout0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vout1x0123 = vcvtq_f32_s32(vacc1x0123); float32x4_t vout1x4567 = vcvtq_f32_s32(vacc1x4567); const float32x4_t vinput_scale01 = vreinterpretq_f32_s32(vld1q_s32(&quantization_params[0].zero_point)); const float32x4_t vbscale0123 = vld1q_f32(w); w = (const float*) w + 4; const float32x4_t vscale0x0123 = vmulq_lane_f32(vbscale0123, vget_low_f32(vinput_scale01), 1); const float32x4_t vscale1x0123 = vmulq_lane_f32(vbscale0123, vget_high_f32(vinput_scale01), 1); const float32x4_t vbscale4567 = vld1q_f32(w); w = 
(const float*) w + 4; const float32x4_t vscale0x4567 = vmulq_lane_f32(vbscale4567, vget_low_f32(vinput_scale01), 1); const float32x4_t vscale1x4567 = vmulq_lane_f32(vbscale4567, vget_high_f32(vinput_scale01), 1); const float32x4_t vbias0123 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x0123 = vfmaq_f32(vbias0123, vout0x0123, vscale0x0123); vout1x0123 = vfmaq_f32(vbias0123, vout1x0123, vscale1x0123);
#else
vout0x0123 = vmlaq_f32(vbias0123, vout0x0123, vscale0x0123); vout1x0123 = vmlaq_f32(vbias0123, vout1x0123, vscale1x0123);
#endif
const float32x4_t vbias4567 = vld1q_f32(w); w = (const float*) w + 4;
#if XNN_ARCH_ARM64
vout0x4567 = vfmaq_f32(vbias4567, vout0x4567, vscale0x4567); vout1x4567 = vfmaq_f32(vbias4567, vout1x4567, vscale1x4567);
#else
vout0x4567 = vmlaq_f32(vbias4567, vout0x4567, vscale0x4567); vout1x4567 = vmlaq_f32(vbias4567, vout1x4567, vscale1x4567);
#endif
const float32x4_t voutput_min = vld1q_dup_f32(&params->scalar.min); vout0x0123 = vmaxq_f32(vout0x0123, voutput_min); vout0x4567 = vmaxq_f32(vout0x4567, voutput_min); vout1x0123 = vmaxq_f32(vout1x0123, voutput_min); vout1x4567 = vmaxq_f32(vout1x4567, voutput_min); const float32x4_t voutput_max = vld1q_dup_f32(&params->scalar.max); vout0x0123 = vminq_f32(vout0x0123, voutput_max); vout0x4567 = vminq_f32(vout0x4567, voutput_max); vout1x0123 = vminq_f32(vout1x0123, voutput_max); vout1x4567 = vminq_f32(vout1x4567, voutput_max); if XNN_LIKELY(nc >= 8) { vst1q_f32(c1, vout1x0123); vst1q_f32(c1 + 4, vout1x4567); vst1q_f32(c0, vout0x0123); vst1q_f32(c0 + 4, vout0x4567); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); c1 = (float*) ((uintptr_t) c1 + cn_stride); nc -= 8; } else { if (nc & 4) { vst1q_f32(c1, vout1x0123); c1 += 4; vout1x0123 = vout1x4567; vst1q_f32(c0, vout0x0123); c0 += 4; vout0x0123 = vout0x4567; } float32x2_t vout1x01 = vget_low_f32(vout1x0123); float32x2_t vout0x01 = vget_low_f32(vout0x0123); if (nc & 2) { vst1_f32(c1, vout1x01); c1 += 2; vst1_f32(c0, vout0x01); c0 += 2; vout1x01 = vget_high_f32(vout1x0123); vout0x01 = vget_high_f32(vout0x0123); } if (nc & 1) { vst1_lane_f32(c1, vout1x01, 0); vst1_lane_f32(c0, vout0x01, 0); } nc = 0; } } while (nc != 0); } void xnn_qs16_qs8_vcvt_ukernel__neon_x32( size_t batch, const int16_t* input, int8_t* output, const union xnn_qs16_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int16_t) == 0); assert(input != NULL); assert(output != NULL); const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int16_t); batch -= 32 * sizeof(int16_t)) { const int16x8_t vx0 = vld1q_s16(input); input += 8; const int16x8_t vx1 = vld1q_s16(input); input += 8; const int16x8_t vx2 = vld1q_s16(input); input += 8; const int16x8_t vx3 = vld1q_s16(input); input += 8; int32x4_t vacc_lo0 = vshll_n_s16(vget_low_s16(vx0), 15); int32x4_t vacc_hi0 = vshll_n_s16(vget_high_s16(vx0), 15); int32x4_t vacc_lo1 = vshll_n_s16(vget_low_s16(vx1), 15); int32x4_t vacc_hi1 = vshll_n_s16(vget_high_s16(vx1), 15); int32x4_t vacc_lo2 = vshll_n_s16(vget_low_s16(vx2), 15); int32x4_t vacc_hi2 = vshll_n_s16(vget_high_s16(vx2), 15); int32x4_t vacc_lo3 = vshll_n_s16(vget_low_s16(vx3), 15); int32x4_t vacc_hi3 = vshll_n_s16(vget_high_s16(vx3), 15); vacc_lo0 = vqrdmulhq_s32(vacc_lo0, vmultiplier);
vacc_hi0 = vqrdmulhq_s32(vacc_hi0, vmultiplier); vacc_lo1 = vqrdmulhq_s32(vacc_lo1, vmultiplier); vacc_hi1 = vqrdmulhq_s32(vacc_hi1, vmultiplier); vacc_lo2 = vqrdmulhq_s32(vacc_lo2, vmultiplier); vacc_hi2 = vqrdmulhq_s32(vacc_hi2, vmultiplier); vacc_lo3 = vqrdmulhq_s32(vacc_lo3, vmultiplier); vacc_hi3 = vqrdmulhq_s32(vacc_hi3, vmultiplier); int16x8_t vacc0 = vcombine_s16(vqmovn_s32(vacc_lo0), vqmovn_s32(vacc_hi0)); int16x8_t vacc1 = vcombine_s16(vqmovn_s32(vacc_lo1), vqmovn_s32(vacc_hi1)); int16x8_t vacc2 = vcombine_s16(vqmovn_s32(vacc_lo2), vqmovn_s32(vacc_hi2)); int16x8_t vacc3 = vcombine_s16(vqmovn_s32(vacc_lo3), vqmovn_s32(vacc_hi3)); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x8_t vy0 = vqmovn_s16(vacc0); const int8x8_t vy1 = vqmovn_s16(vacc1); const int8x8_t vy2 = vqmovn_s16(vacc2); const int8x8_t vy3 = vqmovn_s16(vacc3); vst1_s8(output, vy0); output += 8; vst1_s8(output, vy1); output += 8; vst1_s8(output, vy2); output += 8; vst1_s8(output, vy3); output += 8; } for (; batch >= 8 * sizeof(int16_t); batch -= 8 * sizeof(int16_t)) { const int16x8_t vx = vld1q_s16(input); input += 8; int32x4_t vacc_lo = vshll_n_s16(vget_low_s16(vx), 15); int32x4_t vacc_hi = vshll_n_s16(vget_high_s16(vx), 15); vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier); vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier); int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int16_t)); assert(batch <= 7 * sizeof(int16_t)); const int16x8_t vx = vld1q_s16(input); int32x4_t vacc_lo = vshll_n_s16(vget_low_s16(vx), 15); int32x4_t vacc_hi = vshll_n_s16(vget_high_s16(vx), 15); vacc_lo = vqrdmulhq_s32(vacc_lo, vmultiplier); vacc_hi = vqrdmulhq_s32(vacc_hi, vmultiplier); int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int16_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int16_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int16_t))) { vst1_lane_s8((void*) output, vy, 0); } } } void xnn_qs8_dwconv_minmax_rndnu_ukernel_25p16c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1];
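/* Set up the 25 input-row pointers: rows that alias the zero buffer are kept as-is, all others are rebased by input_offset. */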
assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { 
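/* Main channel loop: 16 channels per iteration. The packed weights hold 16 int32 biases followed by 25 taps of 16 int8 weights each. */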
int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); 
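/* Taps are consumed in pairs: vmull_s8 opens a 16-bit product, vmlal_s8 folds in the next tap, and vaddw_s16 widens the pair into the 32-bit accumulators. */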
const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi9x89ABCDEF = vld1_s8(i9); i9 += 8; const int8x8_t vk9x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi9x89ABCDEF, vk9x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi10x89ABCDEF = vld1_s8(i10); i10 += 8; const int8x8_t vk10x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); vprod89ABCDEF = vmull_s8(vi10x89ABCDEF, vk10x89ABCDEF); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi11x89ABCDEF = vld1_s8(i11); i11 += 8; const int8x8_t vk11x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi11x89ABCDEF, vk11x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi12x89ABCDEF = vld1_s8(i12); i12 += 8; const int8x8_t vk12x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); vprod89ABCDEF = vmull_s8(vi12x89ABCDEF, vk12x89ABCDEF); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi13x89ABCDEF = vld1_s8(i13); i13 += 8; const int8x8_t vk13x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi13x89ABCDEF, vk13x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = 
vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi14x89ABCDEF = vld1_s8(i14); i14 += 8; const int8x8_t vk14x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); vprod89ABCDEF = vmull_s8(vi14x89ABCDEF, vk14x89ABCDEF); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi15x89ABCDEF = vld1_s8(i15); i15 += 8; const int8x8_t vk15x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi15x89ABCDEF, vk15x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi16x89ABCDEF = vld1_s8(i16); i16 += 8; const int8x8_t vk16x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); vprod89ABCDEF = vmull_s8(vi16x89ABCDEF, vk16x89ABCDEF); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi17x89ABCDEF = vld1_s8(i17); i17 += 8; const int8x8_t vk17x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi17x89ABCDEF, vk17x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi18x89ABCDEF = vld1_s8(i18); i18 += 8; const int8x8_t vk18x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); vprod89ABCDEF = vmull_s8(vi18x89ABCDEF, vk18x89ABCDEF); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi19x89ABCDEF = vld1_s8(i19); i19 += 8; const int8x8_t vk19x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi19x89ABCDEF, vk19x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi20x89ABCDEF = vld1_s8(i20); i20 += 8; const int8x8_t vk20x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); vprod89ABCDEF = vmull_s8(vi20x89ABCDEF, vk20x89ABCDEF); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t 
vi21x89ABCDEF = vld1_s8(i21); i21 += 8; const int8x8_t vk21x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi21x89ABCDEF, vk21x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi22x89ABCDEF = vld1_s8(i22); i22 += 8; const int8x8_t vk22x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); vprod89ABCDEF = vmull_s8(vi22x89ABCDEF, vk22x89ABCDEF); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi23x89ABCDEF = vld1_s8(i23); i23 += 8; const int8x8_t vk23x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi23x89ABCDEF, vk23x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi24x89ABCDEF = vld1_s8(i24); i24 += 8; const int8x8_t vk24x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vprod89ABCDEF = vmull_s8(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif // !XNN_ARCH_ARM64
vout0123456789ABCDEF =
vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8((const void*) (k + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8((const void*) (k + 152)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8((const void*) (k + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8((const void*) (k + 184)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8((const void*) (k + 200)); 
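/* Remaining taps 13-24 repeat the same vmull_s8/vmlal_s8 pairing used for the earlier taps, widening each pair of 8-bit products into the 32-bit accumulators via vaddw_s16. */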
vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8((const void*) (k + 216)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8((const void*) (k + 232)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8((const void*) (k + 248)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8((const void*) (k + 264)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8((const void*) (k + 280)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8((const void*) (k + 296)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8((const void*) (k + 312)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8((const void*) (k + 328)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8((const void*) (k + 344)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8((const void*) (k + 360)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8((const void*) (k + 376)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); 
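/* Scalar sketch of the rndnu requantization performed above. This assumes the usual convention that right_pre_shift and right_post_shift hold negative shift amounts, so the vqshlq_s32/vrshlq_s32 calls effectively shift right:
 *   acc = saturating_shift(acc, right_pre_shift);              // vqshlq_s32
 *   acc = (int32_t) (((int64_t) acc * multiplier * 2) >> 32);  // vqdmulhq_s32 (truncating high half)
 *   acc = rounding_shift(acc, right_post_shift);               // vrshlq_s32
 *   out = clamp(sat_s8(sat_s16(acc) + output_zero_point), output_min, output_max);
 */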
vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { vst1_s8(output, vout01234567); output += 8; c -= 8; } else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
} const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = 
vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = 
(const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); #endif // !XNN_ARCH_ARM64 vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vk0x01234567 = vld1_s8(w); int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vk1x01234567 = vld1_s8((const void*) ((const int8_t*) w + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); const int8x8_t vk2x01234567 = vld1_s8((const void*) ((const int8_t*) w + 16)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); const int8x8_t vk3x01234567 = vld1_s8((const void*) ((const int8_t*) w + 24)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); const int8x8_t vk4x01234567 = vld1_s8((const void*) ((const int8_t*) w + 32)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); const int8x8_t vk5x01234567 = vld1_s8((const void*) ((const int8_t*) w + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = 
vld1_s8(i6); const int8x8_t vk6x01234567 = vld1_s8((const void*) ((const int8_t*) w + 48)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); const int8x8_t vk7x01234567 = vld1_s8((const void*) ((const int8_t*) w + 56)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); const int8x8_t vk8x01234567 = vld1_s8((const void*) ((const int8_t*) w + 64)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); const int8x8_t vk9x01234567 = vld1_s8((const void*) ((const int8_t*) w + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); const int8x8_t vk10x01234567 = vld1_s8((const void*) ((const int8_t*) w + 80)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); const int8x8_t vk11x01234567 = vld1_s8((const void*) ((const int8_t*) w + 88)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); const int8x8_t vk12x01234567 = vld1_s8((const void*) ((const int8_t*) w + 96)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); const int8x8_t vk13x01234567 = vld1_s8((const void*) ((const int8_t*) w + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); const int8x8_t vk14x01234567 = vld1_s8((const void*) ((const int8_t*) w + 112)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); const int8x8_t vk15x01234567 = vld1_s8((const void*) ((const int8_t*) w + 120)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); const int8x8_t vk16x01234567 = vld1_s8((const void*) ((const int8_t*) w + 128)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); const int8x8_t vk17x01234567 = vld1_s8((const void*) ((const int8_t*) w + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); const int8x8_t vk18x01234567 = vld1_s8((const void*) ((const int8_t*) w + 144)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); const int8x8_t vk19x01234567 = vld1_s8((const void*) ((const int8_t*) w + 152)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); const int8x8_t vk20x01234567 = vld1_s8((const void*) 
((const int8_t*) w + 160)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); const int8x8_t vk21x01234567 = vld1_s8((const void*) ((const int8_t*) w + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); const int8x8_t vk22x01234567 = vld1_s8((const void*) ((const int8_t*) w + 176)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); const int8x8_t vk23x01234567 = vld1_s8((const void*) ((const int8_t*) w + 184)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); const int8x8_t vk24x01234567 = vld1_s8((const void*) ((const int8_t*) w + 192)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mla8_ld64( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 =
input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) 
w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); 
vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { 
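/* A full 8-channel group can be stored in one shot; narrower tails fall through to the 4/2/1-lane stores below. */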
vst1_s8(output, vout01234567); output += 8; c -= 8; } else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_f32_vcvt_ukernel__neon_x32( size_t batch, const int8_t* input, float* output, const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vminus_zero_point = vreinterpretq_s16_u32(vld1q_dup_u32((const void*) params->neon.minus_zero_point)); const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t vx01234567 = vld1_s8(input); input += 8; const int8x8_t vx89ABCDEF = vld1_s8(input); input += 8; const int8x8_t vxGHIJKLMN = vld1_s8(input); input += 8; const int8x8_t vxOPQRSTUV = vld1_s8(input); input += 8; const int16x8_t vhx01234567 = vaddw_s8(vminus_zero_point, vx01234567); const int16x8_t vhx89ABCDEF = vaddw_s8(vminus_zero_point, vx89ABCDEF); const int16x8_t vhxGHIJKLMN = vaddw_s8(vminus_zero_point, vxGHIJKLMN); const int16x8_t vhxOPQRSTUV = vaddw_s8(vminus_zero_point, vxOPQRSTUV); const int32x4_t vwx0123 = vmovl_s16(vget_low_s16(vhx01234567)); const int32x4_t vwx4567 = vmovl_s16(vget_high_s16(vhx01234567)); const int32x4_t vwx89AB = vmovl_s16(vget_low_s16(vhx89ABCDEF)); const int32x4_t vwxCDEF = vmovl_s16(vget_high_s16(vhx89ABCDEF)); const int32x4_t vwxGHIJ = vmovl_s16(vget_low_s16(vhxGHIJKLMN)); const int32x4_t vwxKLMN = vmovl_s16(vget_high_s16(vhxGHIJKLMN)); const int32x4_t vwxOPQR = vmovl_s16(vget_low_s16(vhxOPQRSTUV)); const int32x4_t vwxSTUV = vmovl_s16(vget_high_s16(vhxOPQRSTUV)); float32x4_t vy0123 = vcvtq_f32_s32(vwx0123); float32x4_t vy4567 = vcvtq_f32_s32(vwx4567); float32x4_t vy89AB = vcvtq_f32_s32(vwx89AB); float32x4_t vyCDEF = vcvtq_f32_s32(vwxCDEF); float32x4_t vyGHIJ = vcvtq_f32_s32(vwxGHIJ); float32x4_t vyKLMN = vcvtq_f32_s32(vwxKLMN); float32x4_t vyOPQR = vcvtq_f32_s32(vwxOPQR); float32x4_t vySTUV = vcvtq_f32_s32(vwxSTUV); vy0123 = vmulq_f32(vy0123, vscale); vy4567 = vmulq_f32(vy4567, vscale); vy89AB = vmulq_f32(vy89AB, vscale); vyCDEF = vmulq_f32(vyCDEF, vscale); vyGHIJ = vmulq_f32(vyGHIJ, vscale); vyKLMN = vmulq_f32(vyKLMN, vscale); vyOPQR = vmulq_f32(vyOPQR, vscale); vySTUV = vmulq_f32(vySTUV, vscale); vst1q_f32(output, vy0123); output += 4; vst1q_f32(output, vy4567); output += 4; vst1q_f32(output, vy89AB); output += 4; vst1q_f32(output, vyCDEF); output += 4; vst1q_f32(output, vyGHIJ); output += 4; vst1q_f32(output, vyKLMN); output += 4; vst1q_f32(output, vyOPQR); output += 4; vst1q_f32(output, vySTUV); output += 4; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; const int16x8_t vhx = vaddw_s8(vminus_zero_point, vx); const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx)); const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx)); float32x4_t vy_lo = vcvtq_f32_s32(vwx_lo); float32x4_t vy_hi = vcvtq_f32_s32(vwx_hi); vy_lo = vmulq_f32(vy_lo, vscale); vy_hi = vmulq_f32(vy_hi, vscale); vst1q_f32(output, vy_lo); output += 4;
vst1q_f32(output, vy_hi); output += 4; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); const int16x8_t vhx = vaddw_s8(vminus_zero_point, vx); const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx)); const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx)); float32x4_t vy = vcvtq_f32_s32(vwx_lo); vy = vmulq_f32(vy, vscale); if (batch & (4 * sizeof(int8_t))) { vst1q_f32(output, vy); output += 4; vy = vcvtq_f32_s32(vwx_hi); vy = vmulq_f32(vy, vscale); } float32x2_t vy_lo = vget_low_f32(vy); if (batch & (2 * sizeof(int8_t))) { vst1_f32(output, vy_lo); output += 2; vy_lo = vget_high_f32(vy); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_f32(output, vy_lo, 0); } } } void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int32_t* buffer, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 8)) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vst1q_s32(b, vacc0123); b += 4; vst1q_s32(b, vacc4567); b += 4; } for (rows -= 7; rows > 7; rows -= 7) { i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 8)) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 
8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(b); int32x4_t vacc4567 = vld1q_s32(b + 4); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vst1q_s32(b, vacc0123); b += 4; vst1q_s32(b, vacc4567); b += 4; } } i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max); for (; channels >= 8; channels -= 8) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif // !XNN_ARCH_ARM64 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max);
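/* Final pass: the buffered partial sums plus the last (up to 7) rows have been requantized; store 8 finished channels per iteration. */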
vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(channels != 0) { { const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vi2x01234567 = vld1_s8(i2); int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (channels & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (channels & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (channels & 1) { vst1_lane_s8(output, vout01234567, 0); } } } } void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min); const int8x8_t voutput_max =
vld1_dup_s8(&params->rndnu_neon.output_max); for (; channels >= 8; channels -= 8) { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif // !XNN_ARCH_ARM64 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(channels != 0) { { const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); if (channels & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if
(channels & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (channels & 1) { vst1_lane_s8(output, vout01234567, 0); } } } } void xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const 
int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = 
vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = 
vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
              if (k > 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
              }
            }
          }
        }
      }
    }
  }
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
  vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
  vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
  vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
  vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
  vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
  vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
  vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
  vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
  vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
  vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
  vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
  int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
  vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
  vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
  int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
  int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
  vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
  vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
  int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
#endif
  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
  vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
  vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
  if (nc >= 16) {
    vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
    c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
    nc -= 16;
  } else {
    int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
    if (nc & 8) {
      vst1_s8(c0, vout0x01234567); c0 += 8;
      vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
    }
    if (nc & 4) {
      vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
      vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
    }
    if (nc & 2) {
      vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
      vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
    }
    if (nc & 1) {
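      // nc == 1: store the single remaining output byte.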
vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 
0) {
      const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
      const int16x8_t vxa0 = vmovl_s8(va0);
      const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
      const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      if (k >= 2 * sizeof(int8_t)) {
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        if (k > 2 * sizeof(int8_t)) {
          const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
          const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          if (k >= 4 * sizeof(int8_t)) {
            const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            if (k > 4 * sizeof(int8_t)) {
              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              if (k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
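    // AArch32 path: vqmovn_high_s32 is unavailable, so narrow each half
    // separately and combine.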
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
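// Editorial note: every kernel in this family finishes with the same "rndnu"
// requantization: a saturating pre-shift (vqshlq_s32), a doubling multiply
// returning the high 32 bits (vqdmulhq_s32), and a rounding post-shift
// (vrshlq_s32), followed by zero-point addition and min/max clamping. The
// shift parameters are stored as negative counts, so the left-shift
// intrinsics act as right shifts. The scalar sketch below is illustrative
// only -- it is not part of XNNPACK, it takes the shift counts as positive
// right shifts, and it ignores the saturation performed by the Q intrinsics.
static inline int8_t rndnu_requantize_sketch(
    int32_t vacc,           // raw int32 accumulator
    uint32_t pre_shift,     // positive counterpart of right_pre_shift
    int32_t multiplier,     // fixed-point multiplier from the params struct
    uint32_t post_shift,    // positive counterpart of right_post_shift, >= 1
    int16_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  const int32_t vpre = vacc >> pre_shift;  // vqshlq_s32 with a negative shift count
  // vqdmulhq_s32: high 32 bits of the doubled 64-bit product.
  const int32_t vmulhi = (int32_t) (((int64_t) vpre * (int64_t) multiplier) >> 31);
  // vrshlq_s32 with a negative shift count: rounding right shift.
  int32_t vout = (int32_t) (((int64_t) vmulhi + (INT64_C(1) << (post_shift - 1))) >> post_shift);
  vout += output_zero_point;                     // vqaddq_s16
  vout = vout < output_min ? output_min : vout;  // vmax_s8
  vout = vout > output_max ? output_max : vout;  // vmin_s8
  return (int8_t) vout;
}

// The c2s4 kernels below use a different inner loop than the mlal_lane
// kernels above: they multiply int8 lanes directly (vmull_s8/vmlal_s8),
// rotate the activation vector by 2 bytes between the c0..c3 shuffles
// (vext_s8), and fold the 16-bit products into the 32-bit accumulators
// with vpadalq_s16.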
void xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  const int8_t* a0 = a;
  int8_t* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    size_t k = kc;
    while (k >= 16 * sizeof(int8_t)) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      k -= 16 * sizeof(int8_t);
    }
    if (k != 0) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
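    // Clamp to the quantized activation range before storing.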
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 2);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  const int8_t* a0 = a;
  int8_t* c0 = c;
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 2) {
    a1 = a0;
    c1 = c0;
  }
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    size_t k = kc;
    while (k >= 16 * sizeof(int8_t)) {
      int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
      int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
      int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
      int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
      const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
      const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);
      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8;
      vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1);
      vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va0x1 = vext_s8(va0x1, va0x1, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      va1x1 = vext_s8(va1x1, va1x1, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 =
vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t 
vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
    }
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
    vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
    const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
      }
      nc = 0;
    }
  } while (nc != 0);
}
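// The IGEMM kernels below are the indirect-GEMM counterparts of the GEMM
// kernels above: each of the ks steps re-reads the activation row pointer
// from the indirection buffer a, offsetting every pointer except the shared
// zero row by a_offset, so padded taps never touch real data.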
void xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;
      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);
        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
        vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
        vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        const int8x8_t vb89ABCDEFc1 =
vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, 
vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, 
vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
            if (k > 4 * sizeof(int8_t)) {
              const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
              const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
              const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
              if (k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
#endif
    const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
    if (nc >= 16) {
      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
      nc -= 16;
    } else {
      int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_s8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
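// 1x8 indirect GEMM: identical structure to the 1x16 kernel above, with half
// as many output columns per pass over the packed weights.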
void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;
      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);
        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
        const int8x8_t vb01234567c3 =
vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if 
(k >= 6 * sizeof(int8_t)) {
                const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                if (k > 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
    const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);
  int8_t* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const
int8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = 
vmull_s8(vb4567c0x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
va0x0 = vext_s8(va0x0, va0x0, 2);
int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
}
p -= 1 * sizeof(void*);
} while (p != 0);

const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
if (nc >= 8) {
  vst1_s8(c0 + 0, vout0x01234567);
  c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
  a = (const int8_t**restrict) ((uintptr_t) a - ks);
  nc -= 8;
} else {
  if (nc & 4) {
    vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
    vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
  }
  if (nc & 2) {
    vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
    vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
  }
  if (nc & 1) {
    vst1_lane_s8(c0, vout0x01234567, 0);
  }
  nc = 0;
}
} while (nc != 0);
}

void xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 2);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (2 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  int8_t* c0 = c;
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 2) {
    c1 = c0;
  }

  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w +
4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } a += 2; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const 
int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); 
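/*
 * Added note: this trailing-k block repeats the c0..c3 passes of the main
 * loop with a single 8-byte activation vector per row (no vmlal_s8 second
 * half). Each vext_s8(va, va, 2) rotates the activations by one 2-element
 * subgroup, so successive passes multiply a different pair of k values
 * against their packed weights, and vpadalq_s16 pairwise-adds the int16
 * products into the int32 accumulators.
 */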
vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
}
p -= 2 * sizeof(void*);
} while (p != 0);

const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
if (nc >= 8) {
  vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
  vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
  c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
  c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
  a = (const int8_t**restrict) ((uintptr_t) a - ks);
  nc -= 8;
} else {
  if (nc & 4) {
    vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
    vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
    vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
  }
  if (nc & 2) {
    vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
    vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
    vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
  }
  if (nc & 1) {
    vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
    vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
  }
  nc = 0;
}
} while (nc != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); }
    const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); }
    const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); }
    const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); }
    const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); }
    const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); }
    const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); }
    const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); }
    const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); }
    const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); }
    const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); }
    const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); }
    const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); }
    const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); }
    const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); }
    const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); }
    const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); }
    const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); }
    const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); }
    const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); }
    const int8_t* i20 = input[20]; assert(i20 != NULL); if
XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 
8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi9x89ABCDEF = vld1_s8(i9); i9 += 8; const int8x8_t vk9x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi9x89ABCDEF, vk9x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi10x89ABCDEF = vld1_s8(i10); i10 += 8; const int8x8_t vk10x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); vprod89ABCDEF = vmull_s8(vi10x89ABCDEF, vk10x89ABCDEF); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi11x89ABCDEF = vld1_s8(i11); i11 += 8; const int8x8_t vk11x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi11x89ABCDEF, vk11x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t 
vi12x89ABCDEF = vld1_s8(i12); i12 += 8; const int8x8_t vk12x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); vprod89ABCDEF = vmull_s8(vi12x89ABCDEF, vk12x89ABCDEF); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi13x89ABCDEF = vld1_s8(i13); i13 += 8; const int8x8_t vk13x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi13x89ABCDEF, vk13x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi14x89ABCDEF = vld1_s8(i14); i14 += 8; const int8x8_t vk14x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); vprod89ABCDEF = vmull_s8(vi14x89ABCDEF, vk14x89ABCDEF); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi15x89ABCDEF = vld1_s8(i15); i15 += 8; const int8x8_t vk15x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi15x89ABCDEF, vk15x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi16x89ABCDEF = vld1_s8(i16); i16 += 8; const int8x8_t vk16x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); vprod89ABCDEF = vmull_s8(vi16x89ABCDEF, vk16x89ABCDEF); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi17x89ABCDEF = vld1_s8(i17); i17 += 8; const int8x8_t vk17x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi17x89ABCDEF, vk17x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi18x89ABCDEF = vld1_s8(i18); i18 += 8; const int8x8_t vk18x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); vprod89ABCDEF = vmull_s8(vi18x89ABCDEF, vk18x89ABCDEF); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi19x89ABCDEF = vld1_s8(i19); i19 += 8; const int8x8_t vk19x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); 
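/*
 * Added note: taps are consumed in pairs here -- vmull_s8 starts a fresh
 * int16 product vector on an even tap and vmlal_s8 folds the following odd
 * tap into it; only then is the pair widened into the int32 accumulators
 * with vaddw_s16. Widening after every pair keeps the short-lived int16
 * stage within range for the operand ranges these kernels assume, while
 * halving the number of widening adds.
 */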
vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi19x89ABCDEF, vk19x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi20x89ABCDEF = vld1_s8(i20); i20 += 8; const int8x8_t vk20x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); vprod89ABCDEF = vmull_s8(vi20x89ABCDEF, vk20x89ABCDEF); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi21x89ABCDEF = vld1_s8(i21); i21 += 8; const int8x8_t vk21x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi21x89ABCDEF, vk21x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi22x89ABCDEF = vld1_s8(i22); i22 += 8; const int8x8_t vk22x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); vprod89ABCDEF = vmull_s8(vi22x89ABCDEF, vk22x89ABCDEF); const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8; const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi23x89ABCDEF = vld1_s8(i23); i23 += 8; const int8x8_t vk23x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi23x89ABCDEF, vk23x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8; const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi24x89ABCDEF = vld1_s8(i24); i24 += 8; const int8x8_t vk24x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vprod89ABCDEF = vmull_s8(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, 
vscale4567);
vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB);
vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF);
vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif // !XNN_ARCH_ARM64
vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
vst1q_s8(output, vout0123456789ABCDEF); output += 16;
}
if XNN_UNLIKELY(c != 0) {
  const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
  do {
    int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
    int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
    const int8x8_t vk0x01234567 = vld1_s8(k); k += 8;
    int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567);
    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
    const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8));
    vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
    const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24));
    vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567);
    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
    const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40));
    vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
    const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56));
    vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567);
    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
    const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72));
    vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
    const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88));
    vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567);
    const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8;
    const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104));
    vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567);
    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
    const
int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8((const void*) (k + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8((const void*) (k + 152)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8((const void*) (k + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8((const void*) (k + 184)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8((const void*) (k + 200)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8((const void*) (k + 216)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = vld1_s8((const void*) (k + 232)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8; const int8x8_t vk16x01234567 = vld1_s8((const void*) (k + 248)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8; const int8x8_t vk17x01234567 = vld1_s8((const void*) (k + 264)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8; const int8x8_t vk18x01234567 = vld1_s8((const void*) (k + 280)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8; const int8x8_t vk19x01234567 = vld1_s8((const void*) (k + 296)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8; const int8x8_t vk20x01234567 = vld1_s8((const void*) (k + 312)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8; const int8x8_t vk21x01234567 = vld1_s8((const void*) (k + 328)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8; const int8x8_t vk22x01234567 = vld1_s8((const void*) (k + 344)); vprod01234567 = 
vmull_s8(vi22x01234567, vk22x01234567);
const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8;
const int8x8_t vk23x01234567 = vld1_s8((const void*) (k + 360));
vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8;
const int8x8_t vk24x01234567 = vld1_s8((const void*) (k + 376));
vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 400 * sizeof(int8_t)));
const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 400 * sizeof(int8_t) + 4 * sizeof(float)));
vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));

if XNN_LIKELY(c >= 8) {
  vst1_s8(output, vout01234567); output += 8;
  c -= 8;
} else {
  if (c & 4) {
    vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
    vout01234567 = vext_s8(vout01234567, vout01234567, 4);
  }
  if (c & 2) {
    vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
    vout01234567 = vext_s8(vout01234567, vout01234567, 2);
  }
  if (c & 1) {
    vst1_lane_s8(output, vout01234567, 0); output += 1;
  }
  c = 0;
}
} while (c != 0);
}
output = (int8_t*) ((uintptr_t) output + output_increment);
} while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p8c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min);
  const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); }
    const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); }
    const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); }
    const int8_t* i3 = input[3]; assert(i3 != NULL); if
XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, 
vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8; const int8x8_t vk9x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8; const int8x8_t vk10x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8; const int8x8_t vk11x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8; const int8x8_t vk12x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8; const int8x8_t vk13x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8; const int8x8_t vk14x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8; const int8x8_t vk15x01234567 = 
vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8;
const int8x8_t vk16x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567);
const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8;
const int8x8_t vk17x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8;
const int8x8_t vk18x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567);
const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8;
const int8x8_t vk19x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8;
const int8x8_t vk20x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567);
const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8;
const int8x8_t vk21x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8;
const int8x8_t vk22x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567);
const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8;
const int8x8_t vk23x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8;
const int8x8_t vk24x01234567 = vld1_s8(w); w = (const int8_t*) w + 8;
vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);
vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4;
const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4;
vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

#if XNN_ARCH_ARM64
int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#else // !XNN_ARCH_ARM64
int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#endif // !XNN_ARCH_ARM64
vout01234567 = vmax_s8(vout01234567,
voutput_min); vout01234567 = vmin_s8(vout01234567, voutput_max); vst1_s8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); const int8x8_t vk0x01234567 = vld1_s8(w); int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); const int8x8_t vk1x01234567 = vld1_s8((const void*) ((const int8_t*) w + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); const int8x8_t vk2x01234567 = vld1_s8((const void*) ((const int8_t*) w + 16)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); const int8x8_t vk3x01234567 = vld1_s8((const void*) ((const int8_t*) w + 24)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); const int8x8_t vk4x01234567 = vld1_s8((const void*) ((const int8_t*) w + 32)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); const int8x8_t vk5x01234567 = vld1_s8((const void*) ((const int8_t*) w + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); const int8x8_t vk6x01234567 = vld1_s8((const void*) ((const int8_t*) w + 48)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); const int8x8_t vk7x01234567 = vld1_s8((const void*) ((const int8_t*) w + 56)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); const int8x8_t vk8x01234567 = vld1_s8((const void*) ((const int8_t*) w + 64)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); const int8x8_t vi9x01234567 = vld1_s8(i9); const int8x8_t vk9x01234567 = vld1_s8((const void*) ((const int8_t*) w + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi10x01234567 = vld1_s8(i10); const int8x8_t vk10x01234567 = vld1_s8((const void*) ((const int8_t*) w + 80)); vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567); const int8x8_t vi11x01234567 = vld1_s8(i11); const int8x8_t vk11x01234567 = vld1_s8((const void*) ((const int8_t*) w + 88)); vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi12x01234567 = vld1_s8(i12); const int8x8_t vk12x01234567 = vld1_s8((const void*) ((const int8_t*) w + 96)); vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567); const int8x8_t vi13x01234567 = vld1_s8(i13); const int8x8_t vk13x01234567 = vld1_s8((const void*) ((const int8_t*) w + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, 
vk13x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi14x01234567 = vld1_s8(i14); const int8x8_t vk14x01234567 = vld1_s8((const void*) ((const int8_t*) w + 112)); vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567); const int8x8_t vi15x01234567 = vld1_s8(i15); const int8x8_t vk15x01234567 = vld1_s8((const void*) ((const int8_t*) w + 120)); vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi16x01234567 = vld1_s8(i16); const int8x8_t vk16x01234567 = vld1_s8((const void*) ((const int8_t*) w + 128)); vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567); const int8x8_t vi17x01234567 = vld1_s8(i17); const int8x8_t vk17x01234567 = vld1_s8((const void*) ((const int8_t*) w + 136)); vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi18x01234567 = vld1_s8(i18); const int8x8_t vk18x01234567 = vld1_s8((const void*) ((const int8_t*) w + 144)); vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567); const int8x8_t vi19x01234567 = vld1_s8(i19); const int8x8_t vk19x01234567 = vld1_s8((const void*) ((const int8_t*) w + 152)); vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi20x01234567 = vld1_s8(i20); const int8x8_t vk20x01234567 = vld1_s8((const void*) ((const int8_t*) w + 160)); vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567); const int8x8_t vi21x01234567 = vld1_s8(i21); const int8x8_t vk21x01234567 = vld1_s8((const void*) ((const int8_t*) w + 168)); vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi22x01234567 = vld1_s8(i22); const int8x8_t vk22x01234567 = vld1_s8((const void*) ((const int8_t*) w + 176)); vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567); const int8x8_t vi23x01234567 = vld1_s8(i23); const int8x8_t vk23x01234567 = vld1_s8((const void*) ((const int8_t*) w + 184)); vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi24x01234567 = vld1_s8(i24); const int8x8_t vk24x01234567 = vld1_s8((const void*) ((const int8_t*) w + 192)); vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 0 * sizeof(int32_t) + 200 * sizeof(int8_t))); const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 0 * sizeof(int32_t) + 200 * sizeof(int8_t) + 4 * sizeof(float))); vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); 
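/*
 * Editor's note (added comment): the two vaddq_f32 statements above and the
 * two vqsubq_s32 statements below implement fp32 requantization with the
 * "magic bias" rounding trick.  A scalar sketch of the same math, assuming
 * the usual fp32_neon parameters (magic_bias = 0x1.8p+23f = 12582912.0f and
 * magic_bias_less_output_zero_point = 0x4B400000 - output_zero_point):
 *
 *   float x = (float) acc * scale;   // accumulator times per-channel scale
 *   x += 0x1.8p+23f;                 // forces round-to-nearest; the rounded
 *                                    // integer now sits in the mantissa bits
 *   int32_t y = bits(x) - (0x4B400000 - output_zero_point);
 *   // y == round(acc * scale) + output_zero_point
 *
 * bits() stands for a bit-cast of the float (e.g. via memcpy) and is
 * illustrative only, not an XNNPACK helper.  The saturating vqsubq_s32
 * performs the bit-pattern subtraction; final clamping to the int8 range
 * happens in the vqmovn/vmax/vmin steps further down.
 */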
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif
      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);

      vout01234567 = vmax_s8(vout01234567, voutput_min);
      vout01234567 = vmin_s8(vout01234567, voutput_max);

      if (c & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
        vout01234567 = vext_s8(vout01234567, vout01234567, 4);
      }
      if (c & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
        vout01234567 = vext_s8(vout01234567, vout01234567, 2);
      }
      if (c & 1) {
        vst1_lane_s8(output, vout01234567, 0); output += 1;
      }
    }
  }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    for (; c >= 16; c -= 16) {
      int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4;
      int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4;

      const int8x16_t vi0x0123456789ABCDEF = vld1q_s8(i0); i0 += 16;
      const int8x16_t vk0x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16;
      int16x8_t vprod01234567 = vmull_s8(vget_low_s8(vi0x0123456789ABCDEF), vget_low_s8(vk0x0123456789ABCDEF));
      int16x8_t vprod89ABCDEF = vmull_s8(vget_high_s8(vi0x0123456789ABCDEF), vget_high_s8(vk0x0123456789ABCDEF));
      const int8x16_t vi1x0123456789ABCDEF = vld1q_s8(i1); i1 += 16;
      const int8x16_t vk1x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16;
      vprod01234567 = vmlal_s8(vprod01234567, vget_low_s8(vi1x0123456789ABCDEF), vget_low_s8(vk1x0123456789ABCDEF));
      vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vget_high_s8(vi1x0123456789ABCDEF), vget_high_s8(vk1x0123456789ABCDEF));
      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));
      vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF));
      vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF));
      const int8x16_t vi2x0123456789ABCDEF = vld1q_s8(i2); i2 += 16;
      const int8x16_t
vk2x0123456789ABCDEF = vld1q_s8(w); w = (const int8_t*) w + 16; vprod01234567 = vmull_s8(vget_low_s8(vi2x0123456789ABCDEF), vget_low_s8(vk2x0123456789ABCDEF)); vprod89ABCDEF = vmull_s8(vget_high_s8(vi2x0123456789ABCDEF), vget_high_s8(vk2x0123456789ABCDEF)); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB); vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t 
vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
      const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t) + 4 * sizeof(float)));
      vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567);

      vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
      vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));

      vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif
      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);

      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));

      if XNN_LIKELY(c >= 8) {
        vst1_s8(output, vout01234567); output += 8;
        c -= 8;
      } else {
        if (c & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (c & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (c & 1) {
          vst1_lane_s8(output, vout01234567, 0); output += 1;
        }
        c = 0;
      }
    } while (c != 0);
  }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
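/*
 * Editor's note (added comment): i0..i8 are the nine taps' input rows for
 * this output pixel, fetched through the indirection buffer `input`.  A row
 * that compares equal to the shared `zero` buffer is a padding row, and the
 * checks above deliberately skip rebasing it by input_offset; real rows are
 * rebased.  The same check is applied to i8 just below.
 */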
assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vk0x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); int16x8_t vprod89ABCDEF = vmull_s8(vi0x89ABCDEF, vk0x89ABCDEF); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vk1x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi1x89ABCDEF, vk1x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vk2x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); vprod89ABCDEF = vmull_s8(vi2x89ABCDEF, vk2x89ABCDEF); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; const int8x8_t vk3x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi3x89ABCDEF, vk3x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; const int8x8_t vk4x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); vprod89ABCDEF = vmull_s8(vi4x89ABCDEF, vk4x89ABCDEF); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; const int8x8_t vk5x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi5x89ABCDEF, vk5x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8(w); w = (const 
int8_t*) w + 8; const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; const int8x8_t vk6x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); vprod89ABCDEF = vmull_s8(vi6x89ABCDEF, vk6x89ABCDEF); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi7x89ABCDEF = vld1_s8(i7); i7 += 8; const int8x8_t vk7x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vprod89ABCDEF = vmlal_s8(vprod89ABCDEF, vi7x89ABCDEF, vk7x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vi8x89ABCDEF = vld1_s8(i8); i8 += 8; const int8x8_t vk8x89ABCDEF = vld1_s8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vprod89ABCDEF = vmull_s8(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vprod89ABCDEF)); vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vprod89ABCDEF)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const float*) w + 4; const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const float*) w + 4; vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vfpacc89AB = vmulq_f32(vfpacc89AB, vscale89AB); vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscaleCDEF); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); #else // !XNN_ARCH_ARM64 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); #endif // !XNN_ARCH_ARM64 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const 
int8_t* k = (const int8_t*) ((const int32_t*) w + 16); do { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vk0x01234567 = vld1_s8(k); k += 8; int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567); const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vk1x01234567 = vld1_s8((const void*) (k + 8)); vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vk2x01234567 = vld1_s8((const void*) (k + 24)); vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567); const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vk3x01234567 = vld1_s8((const void*) (k + 40)); vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; const int8x8_t vk4x01234567 = vld1_s8((const void*) (k + 56)); vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567); const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; const int8x8_t vk5x01234567 = vld1_s8((const void*) (k + 72)); vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; const int8x8_t vk6x01234567 = vld1_s8((const void*) (k + 88)); vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567); const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8; const int8x8_t vk7x01234567 = vld1_s8((const void*) (k + 104)); vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8; const int8x8_t vk8x01234567 = vld1_s8((const void*) (k + 120)); vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567); vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567)); vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567)); float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); const float32x4_t vscale0123 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t))); const float32x4_t vscale4567 = vld1q_f32((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t) + 4 * sizeof(float))); vfpacc0123 = vmulq_f32(vfpacc0123, vscale0123); vfpacc4567 = vmulq_f32(vfpacc4567, vscale4567); vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(c >= 8) { vst1_s8(output, vout01234567); output += 8; c -= 8; 
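// Editor's note (added comment): the branch above stores a full group of 8
// channels.  The `else` branch that follows peels the final c < 8 channels:
// it stores 4-, 2- and 1-byte pieces with lane stores, rotating the vector
// with vext_s8 after each piece so the next channels are back in lane 0.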
} else { if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (c & 1) { vst1_lane_s8(output, vout01234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t k = kc; while (k >= 8 * sizeof(int8_t)) { const int8x8_t va0 = vld1_s8(a0); a0 += 8; const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(int8_t)) { const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(int8_t)) { const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(int8_t)) { const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); } } } } } } } float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = 
vld1q_dup_f32(¶ms->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #endif const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = 
(const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4; vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4; vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = 
vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567); #endif const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); nc -= 8; } else { // Final case where not all of the 8 columns fit in the destination. if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0, vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { a1 = a0; c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = 
vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = 
vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
      const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;

      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
      int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
      int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
      int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
      int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
      int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
      int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
      int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
      int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
      va0x0 = vext_s8(va0x0, va0x0, 2);
      va1x0 = vext_s8(va1x0, va1x0, 2);
      int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
      int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
      vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
      vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
      int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
      int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
      vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
      vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
    }

    float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
    float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
    float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
    float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);

    const float32x4_t vscale0123 = vld1q_f32(w); w = (const float*) w + 4;
    vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
    vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123);
    const float32x4_t vscale4567 = vld1q_f32(w); w = (const float*) w + 4;
    vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);
    vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567);

    const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
    vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
    vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
    vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
    vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));

    const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
    vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
    vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point);
    vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);

    #if XNN_ARCH_ARM64
      int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
      int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
      int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
    #else
      int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
      int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
    #endif

    const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
    const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);

    if (nc >= 8) {
      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      nc -= 8;
    } else {
      // Final case where not all of the 8 columns fit in the destination.
      if (nc & 4) {
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  int8_t* c0 = c;

  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      while (k >= 8 * sizeof(int8_t)) {
        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
        const int16x8_t vxa0 = vmovl_s8(va0);

        const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
        const
int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(int8_t); } if XNN_UNLIKELY(k != 0) { const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vmovl_s8(va0); const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(int8_t)) { const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(int8_t)) { const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(int8_t)) { const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8); const 
int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
              if (k > 4 * sizeof(int8_t)) {
                const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
                if (k >= 6 * sizeof(int8_t)) {
                  const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                  const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                  if (k > 6 * sizeof(int8_t)) {
                    const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);
                    const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
                    vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                    vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  }
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
    float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);

    const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
    const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
    vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);

    const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
    vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
    vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));

    const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
    vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);

    #if XNN_ARCH_ARM64
      int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
      int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
    #else
      int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
      int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
    #endif

    const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min);
    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
    const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max);
    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_s8(c0 + 0, vout0x01234567);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const int8_t** restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_s8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const
void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); k -= 16 * sizeof(int8_t); } if (k 
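/* k remainder below: a single 8-byte activation slice, without the vmlal_s8 pairing used by the 16-byte main loop. */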
!= 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); } p -= 1 * sizeof(void*); } while (p != 0); float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#endif
const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min); vout0x01234567 = vmax_s8(vout0x01234567, voutput_min); const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max); vout0x01234567 = vmin_s8(vout0x01234567, voutput_max); if (nc >= 8) { vst1_s8(c0 + 0, vout0x01234567); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1_lane_u32((void*) c0, vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4); } if (nc & 2) { vst1_lane_u16((void*) c0, vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2; vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2); } if (nc & 1) { vst1_lane_s8(c0,
vout0x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal( size_t mr, size_t nc, size_t kc, size_t ks, const int8_t** restrict a, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 2); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (2 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr != 2) { c1 = c0; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } a += 2; size_t k = kc; while (k >= 16 * sizeof(int8_t)) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va0x1 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; int8x8_t va1x1 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1); vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1); vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1); vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0); const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c1 = 
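/* Both rows share each loaded weight vector; only the activations differ between the paired vmull_s8/vmlal_s8 products. */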
vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1); vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1); vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1); vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va0x1 = vext_s8(va0x1, va0x1, 2); va1x0 = vext_s8(va1x0, va1x0, 2); va1x1 = vext_s8(va1x1, va1x1, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1); vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const int8_t*) w + 8; vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1); vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); k -= 16 * sizeof(int8_t); } if (k != 0) { int8x8_t va0x0 = vld1_s8(a0); a0 += 8; int8x8_t va1x0 = vld1_s8(a1); a1 += 8; const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8; int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0); int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0); int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0); int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0); int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1); int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0); int16x8_t vprod1x4567c1 = 
vmull_s8(vb4567c1x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0); int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2); int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0); int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2); va0x0 = vext_s8(va0x0, va0x0, 2); va1x0 = vext_s8(va1x0, va1x0, 2); int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0); int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0); vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3); vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3); int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0); int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0); vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3); vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3); } p -= 2 * sizeof(void*); } while (p != 0); float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123); float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567); float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123); float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567); const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123); vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123); const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4); vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567); vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567); const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias)); vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias)); vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias)); vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias)); const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point); vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point); vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point); vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);
#if XNN_ARCH_ARM64
  int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
  int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
#endif
const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min); vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max); vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max); if (nc >= 8) { vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567)); vst1_s8(c0 + 0,
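/* Two-row store: row 0 comes from the low half of the combined 16-lane vector, row 1 from the high half. */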
vget_low_s8(vout0x01234567_1x01234567)); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4; vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2; vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); } if (nc & 1) { vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0); } nc = 0; } } while (nc != 0); }

void xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); int8x16_t vout0123456789ABCDEF =
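/* Saturating narrow from 16 bits to 8 bits, then clamp to [output_min, output_max]. */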
vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int8x8_t vaGHIJKLMN = vld1_s8(input_a); input_a += 8; const int8x8_t vbGHIJKLMN = vld1_s8(input_b); input_b += 8; const int8x8_t vaOPQRSTUV = vld1_s8(input_a); input_a += 8; const int8x8_t vbOPQRSTUV = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t
vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); const int16x8_t vxaGHIJKLMN = vsubl_s8(vaGHIJKLMN, va_zero_point); const int16x8_t vxbGHIJKLMN = vsubl_s8(vbGHIJKLMN, vb_zero_point); const int16x8_t vxaOPQRSTUV = vsubl_s8(vaOPQRSTUV, va_zero_point); const int16x8_t vxbOPQRSTUV = vsubl_s8(vbOPQRSTUV, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmulq_s32(vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmulq_s32(vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmulq_s32(vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmulq_s32(vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vaccGHIJ = vmlaq_s32(vaccGHIJ, vmovl_s16(vget_low_s16(vxbGHIJKLMN)), vb_multiplier); vaccKLMN = vmlaq_s32(vaccKLMN, vmovl_s16(vget_high_s16(vxbGHIJKLMN)), vb_multiplier); vaccOPQR = vmlaq_s32(vaccOPQR, vmovl_s16(vget_low_s16(vxbOPQRSTUV)), vb_multiplier); vaccSTUV = vmlaq_s32(vaccSTUV, vmovl_s16(vget_high_s16(vxbOPQRSTUV)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = 
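/* Tail loop: full 8-byte loads are safe under the XNN_OOB_READS attribute; only the valid 4/2/1-byte pieces are stored. */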
vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output,
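/* Scalar addend was folded into vbias = (b - b_zero_point) * b_multiplier before the loop, so only the per-element operand is scaled here. */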
vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vaGHIJKLMN = vld1_s8(input_a); input_a += 8; const int8x8_t vaOPQRSTUV = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxaGHIJKLMN = vsubl_s8(vaGHIJKLMN, va_zero_point); const int16x8_t vxaOPQRSTUV = vsubl_s8(vaOPQRSTUV, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t
vaccSTUV = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vcvt_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vmultiplier = vld1q_dup_s16(&params->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 =
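/* Requantize: widen to 16 bits, shift into Q7, apply a rounding doubling high multiply (vqrdmulhq_s16), then add the output zero point with saturation. */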
vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vacc2), vqmovn_s16(vacc3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x16( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); vout0 =
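/* h-swish core: vin is the rescaled, half-subtracted, zero-clamped gate (vhalf = 16384 is 0.5 in Q15, so vin <= 0 after the clamp); the two vqdmulhq_s16 steps multiply it by the input rescaled through scale_ratio. */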
vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vout0), vqmovn_s16(vout1)); vst1q_s8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); int16x8_t vin2 = vqdmulhq_s16(vacc2, vinput_scale_div_mantissa); int16x8_t vin3 = vqdmulhq_s16(vacc3, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin2 = vqshlq_s16(vin2, vinput_scale_div_exp); vin3 = vqshlq_s16(vin3, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 =
vqsubq_s16(vin1, vhalf); vin2 = vqsubq_s16(vin2, vhalf); vin3 = vqsubq_s16(vin3, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); vin2 = vminq_s16(vin2, vzero); vin3 = vminq_s16(vin3, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); int16x8_t vout2 = vqdmulhq_s16(vacc2, vscale_ratio); int16x8_t vout3 = vqdmulhq_s16(vacc3, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout2 = vqdmulhq_s16(vout2, vin2); vout3 = vqdmulhq_s16(vout3, vin3); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); vout2 = vqaddq_s16(vout2, voutput_zero_point); vout3 = vqaddq_s16(vout3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vout0), vqmovn_s16(vout1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vout2), vqmovn_s16(vout3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vhswish_ukernel__neon_x8( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(&params->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(&params->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(&params->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx = vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin,
vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vout); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); int8x8_t vy = vqmovn_s16(vout); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vlrelu_ukernel__neon_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int16x8_t vinput_zero_point = vld1q_dup_s16(&params->neon.input_zero_point); const int16x8_t vpositive_multiplier = vld1q_dup_s16(&params->neon.positive_multiplier); const int16x8_t vnegative_multiplier = vld1q_dup_s16(&params->neon.negative_multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const int8x16_t vx0 = vld1q_s8(input); input += 16; const int8x16_t vx1 = vld1q_s8(input); input += 16; int16x8_t vacc0 = vsubw_s8(vinput_zero_point, vget_low_s8(vx0)); int16x8_t vacc1 = vsubw_s8(vinput_zero_point, vget_high_s8(vx0)); int16x8_t vacc2 = vsubw_s8(vinput_zero_point, vget_low_s8(vx1)); int16x8_t vacc3 = vsubw_s8(vinput_zero_point, vget_high_s8(vx1)); const uint16x8_t vmask0 = vcltq_s16(vacc0, vmovq_n_s16(0)); const uint16x8_t vmask1 = vcltq_s16(vacc1, vmovq_n_s16(0)); const uint16x8_t vmask2 = vcltq_s16(vacc2, vmovq_n_s16(0)); const uint16x8_t vmask3 = vcltq_s16(vacc3, vmovq_n_s16(0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); const int16x8_t vmultiplier0 = vbslq_s16(vmask0, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier1 = vbslq_s16(vmask1, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier2 = vbslq_s16(vmask2, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier3 = vbslq_s16(vmask3, vpositive_multiplier, vnegative_multiplier); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier0); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier1); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier2); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier3); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const int8x16_t vy0 = vcombine_s8(vqmovn_s16(vacc0), vqmovn_s16(vacc1)); const int8x16_t vy1 = vcombine_s8(vqmovn_s16(vacc2), vqmovn_s16(vacc3)); vst1q_s8(output, vy0); output += 16; vst1q_s8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const int8x8_t vx
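/* Leaky ReLU: vacc holds (zero_point - x), so vacc < 0 flags inputs above the zero point; vbslq_s16 then picks the positive or negative slope per lane. */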
= vld1_s8(input); input += 8; int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const int8x8_t vy = vqmovn_s16(vacc); vst1_s8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); const int8x8_t vx = vld1_s8(input); int16x8_t vacc = vsubw_s8(vinput_zero_point, vx); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); int8x8_t vy = vqmovn_s16(vacc); if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vy), 0); output += 4; vy = vext_s8(vy, vy, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vy), 0); output += 2; vy = vext_s8(vy, vy, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vy, 0); } } }

void xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point); const int8x8_t vb_zero_point = vld1_dup_s8(params->rndnu_neon.b_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8; const int8x8_t vb89ABCDEF = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point); const int16x8_t vxb89ABCDEF = vsubl_s8(vb89ABCDEF, vb_zero_point); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb89ABCDEF)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 =
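/* rndnu requantization: saturating pre-shift, doubling high multiply, then rounding post-shift; vrshlq_s32 with a negative shift count rounds to nearest with ties up. */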
vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
#if XNN_ARCH_ARM64
  int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else
  int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
#if XNN_ARCH_ARM64
  int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else
  int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif
vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); vst1q_s8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8; const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8; const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point); const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
#if XNN_ARCH_ARM64
  int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
  int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); int8x8_t vout01234567 = vqmovn_s16(vacc01234567); vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { vst1_s8(output, vout01234567); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; vout01234567 = vext_s8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; vout01234567 = vext_s8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } }

void xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); const int16x8_t
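/* Scalar-operand variant: the broadcast (b - b_zero_point) is widened once as vxb below and reused for every batch element. */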
void xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16(
    size_t batch,
    const int8_t* input_a,
    const int8_t* input_b,
    int8_t* output,
    const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(int8_t) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  const int8x8_t va_zero_point = vld1_dup_s8(params->rndnu_neon.a_zero_point);
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);

  const int8x8_t vb = vld1_dup_s8(input_b);
  const int8x8_t vb_zero_point = vld1_dup_s8(params->rndnu_neon.b_zero_point);
  const int16x8_t vxb = vsubl_s8(vb, vb_zero_point);

  for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) {
    const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
    const int8x8_t va89ABCDEF = vld1_s8(input_a); input_a += 8;

    const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
    const int16x8_t vxa89ABCDEF = vsubl_s8(va89ABCDEF, va_zero_point);

    int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb));
    int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb));
    int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb));
    int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
    vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);

    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
    vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);

    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
    vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
    vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
    int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
#else
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
    int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
#endif

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);

#if XNN_ARCH_ARM64
    int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
#else
    int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
#endif

    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);

    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
  }
  if XNN_UNLIKELY(batch != 0) {
    do {
      const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;

      const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);

      int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb));
      int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif

      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
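
      // Store path for the remainder loop: 8 or more leftover elements take a
      // full 8-byte store; a sub-8 remainder is written with progressively
      // narrower lane stores (4, 2, then 1 byte), rotating the vector with
      // vext_s8 after each store so the next bytes sit in lane 0.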
      if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) {
        vst1_s8(output, vout01234567); output += 8;
        batch -= 8 * sizeof(int8_t);
      } else {
        if (batch & (4 * sizeof(int8_t))) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (batch & (2 * sizeof(int8_t))) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (batch & (1 * sizeof(int8_t))) {
          vst1_lane_s8(output, vout01234567, 0);
        }
        batch = 0;
      }
    } while (batch != 0);
  }
}

// Multi-pass QU8 average pooling for kernels of more than 9 taps: a first
// pass accumulates 9 rows into the int32 `buffer`, middle passes add 8 rows
// each, and the final pass (up to 8 rows) rescales in fp32 and stores uint8.
void xnn_qu8_avgpool_minmax_fp32_ukernel_9p8x__neon_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);

  do {
    {
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }
      const uint8_t* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
      }

      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
        const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
        const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
        const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
        const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
        const uint8x8_t vi5 = vld1_u8(i5); i5 += 8;
        const uint8x8_t vi6 = vld1_u8(i6); i6 += 8;
        const uint8x8_t vi7 = vld1_u8(i7); i7 += 8;
        const uint8x8_t vi8 = vld1_u8(i8); i8 += 8;

        const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8);
        const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);
        const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);
        const uint16x8_t vsum67 = vaddl_u8(vi6, vi7);

        const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);
        const uint16x8_t vsum01678 = vaddq_u16(vsum018,
vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); const int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); const int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); vst1q_s32(b, vacc_lo); b += 4; vst1q_s32(b, vacc_hi); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const uint8_t* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } const uint8_t* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } const uint8_t* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } const uint8_t* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); } const uint8_t* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } const uint8_t* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } const uint8_t* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } const uint8_t* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } int32_t* b = buffer; for (size_t c = 0; c < channels; c += 8) { const uint8x8_t vi0 = vld1_u8(i0); i0 += 8; const uint8x8_t vi1 = vld1_u8(i1); i1 += 8; const uint8x8_t vi2 = vld1_u8(i2); i2 += 8; const uint8x8_t vi3 = vld1_u8(i3); i3 += 8; const uint8x8_t vi4 = vld1_u8(i4); i4 += 8; const uint8x8_t vi5 = vld1_u8(i5); i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; int32x4_t vacc_lo = vld1q_s32(b); int32x4_t vacc_hi = vld1q_s32(b + 4); const uint16x8_t vsum01 = vaddl_u8(vi0, vi1); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum0123 = vaddq_u16(vsum01, vsum23); const uint16x8_t vsum4567 = vaddq_u16(vsum45, vsum67); const uint16x8_t vsum = vaddq_u16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vreinterpret_s16_u16(vget_low_u16(vsum))); vacc_hi = vaddw_s16(vacc_hi, vreinterpret_s16_u16(vget_high_u16(vsum))); vst1q_s32(b, vacc_lo); b += 4; vst1q_s32(b, vacc_hi); b += 4; } } { const uint8_t* i0 = input[0]; assert(i0 != NULL); const uint8_t* i1 = input[1]; const uint8_t* i2 = input[2]; const uint8_t* i3 = input[3]; const uint8_t* i4 = input[4]; const uint8_t* i5 = input[5]; const uint8_t* i6 = input[6]; const uint8_t* i7 = input[7]; input = (const uint8_t**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) 
i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } size_t c = channels; int32_t* b = buffer; while (c >= 8) { const uint8x8_t vi0 = vld1_u8(i0); i0 += 8; const uint8x8_t vi1 = vld1_u8(i1); i1 += 8; const uint8x8_t vi2 = vld1_u8(i2); i2 += 8; const uint8x8_t vi3 = vld1_u8(i3); i3 += 8; const uint8x8_t vi4 = vld1_u8(i4); i4 += 8; const uint8x8_t vi5 = vld1_u8(i5); i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; int32x4_t vacc_lo = vld1q_s32(b); b += 4; int32x4_t vacc_hi = vld1q_s32(b); b += 4; const int16x8_t vsum01 = vreinterpretq_s16_u16(vaddl_u8(vi0, vi1)); const int16x8_t vsum23 = vreinterpretq_s16_u16(vaddl_u8(vi2, vi3)); const int16x8_t vsum45 = vreinterpretq_s16_u16(vaddl_u8(vi4, vi5)); const int16x8_t vsum67 = vreinterpretq_s16_u16(vaddl_u8(vi6, vi7)); const int16x8_t vsum0123 = vaddq_s16(vsum01, vsum23); const int16x8_t vsum4567 = vaddq_s16(vsum45, vsum67); const int16x8_t vsum = vaddq_s16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); vst1_u8(output, vout); output += 8; c -= 8; } if (c != 0) { const uint8x8_t vi0 = vld1_u8(i0); const uint8x8_t vi1 = vld1_u8(i1); const uint8x8_t vi2 = vld1_u8(i2); const uint8x8_t vi3 = vld1_u8(i3); const uint8x8_t vi4 = vld1_u8(i4); const uint8x8_t vi5 = vld1_u8(i5); const uint8x8_t vi6 = vld1_u8(i6); const uint8x8_t vi7 = vld1_u8(i7); int32x4_t vacc_lo = vld1q_s32(b); b += 4; int32x4_t vacc_hi = vld1q_s32(b); const int16x8_t vsum01 = vreinterpretq_s16_u16(vaddl_u8(vi0, vi1)); const int16x8_t vsum23 = vreinterpretq_s16_u16(vaddl_u8(vi2, vi3)); const int16x8_t vsum45 = vreinterpretq_s16_u16(vaddl_u8(vi4, vi5)); const int16x8_t vsum67 = vreinterpretq_s16_u16(vaddl_u8(vi6, vi7)); const int16x8_t vsum0123 = vaddq_s16(vsum01, vsum23); const int16x8_t vsum4567 = vaddq_s16(vsum45, vsum67); const int16x8_t vsum = vaddq_s16(vsum0123, vsum4567); vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if 
XNN_ARCH_ARM64
        int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi);
#else
        int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi));
#endif
        uint8x8_t vout = vqmovun_s16(vacc);
        vout = vmax_u8(vout, voutput_min);
        vout = vmin_u8(vout, voutput_max);

        if (c & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4;
          vout = vext_u8(vout, vout, 4);
        }
        if (c & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2;
          vout = vext_u8(vout, vout, 2);
        }
        if (c & 1) {
          vst1_lane_u8(output, vout, 0); output += 1;
        }
      }
    }
    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}

// Single-pass QU8 average pooling for up to 9 kernel taps; tap pointers past
// kernel_elements are redirected to the `zero` buffer.
void xnn_qu8_avgpool_minmax_fp32_ukernel_9x__neon_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    const uint8_t* i1 = input[1];
    const uint8_t* i2 = input[2];
    const uint8_t* i3 = input[3];
    const uint8_t* i4 = input[4];
    const uint8_t* i5 = input[5];
    const uint8_t* i6 = input[6];
    const uint8_t* i7 = input[7];
    const uint8_t* i8 = input[8];
    input = (const uint8_t**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) { i1 = zero; }
    assert(i1 != NULL);
    if (kernel_elements <= 2) { i2 = zero; }
    assert(i2 != NULL);
    if (kernel_elements < 4) { i3 = zero; }
    assert(i3 != NULL);
    if (kernel_elements <= 4) { i4 = zero; }
    assert(i4 != NULL);
    if (kernel_elements < 6) { i5 = zero; }
    assert(i5 != NULL);
    if (kernel_elements <= 6) { i6 = zero; }
    assert(i6 != NULL);
    if (kernel_elements < 8) { i7 = zero; }
    assert(i7 != NULL);
    if (kernel_elements <= 8) { i8 = zero; }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 8) {
      const uint8x8_t vi0 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2 = vld1_u8(i2); i2 += 8;
      const uint8x8_t vi3 = vld1_u8(i3); i3 += 8;
      const uint8x8_t vi4 = vld1_u8(i4); i4 += 8;
      const uint8x8_t vi5 = vld1_u8(i5);
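      // (The remaining tap loads continue below.) The requantization further
      // down is the fp32 "magic bias" scheme: the int32 sum is converted to
      // float, multiplied by vscale, and vmagic_bias is added so that the
      // rounded integer lands in the low mantissa bits; vqsubq_s32 of
      // vmagic_bias_less_output_zero_point then strips the bias while folding
      // in the output zero point.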
i5 += 8; const uint8x8_t vi6 = vld1_u8(i6); i6 += 8; const uint8x8_t vi7 = vld1_u8(i7); i7 += 8; const uint8x8_t vi8 = vld1_u8(i8); i8 += 8; const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45); const uint16x8_t vsum01678 = vaddq_u16(vsum018, vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); vst1_u8(output, vout); output += 8; c -= 8; } if (c != 0) { const uint8x8_t vi0 = vld1_u8(i0); const uint8x8_t vi1 = vld1_u8(i1); const uint8x8_t vi2 = vld1_u8(i2); const uint8x8_t vi3 = vld1_u8(i3); const uint8x8_t vi4 = vld1_u8(i4); const uint8x8_t vi5 = vld1_u8(i5); const uint8x8_t vi6 = vld1_u8(i6); const uint8x8_t vi7 = vld1_u8(i7); const uint8x8_t vi8 = vld1_u8(i8); const uint16x8_t vsum018 = vaddw_u8(vaddl_u8(vi0, vi1), vi8); const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); const uint16x8_t vsum67 = vaddl_u8(vi6, vi7); const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45); const uint16x8_t vsum01678 = vaddq_u16(vsum018, vsum67); const uint16x8_t vsum = vaddq_u16(vsum2345, vsum01678); int32x4_t vacc_lo = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_low_u16(vsum))); int32x4_t vacc_hi = vaddw_s16(vinit_bias, vreinterpret_s16_u16(vget_high_u16(vsum))); float32x4_t vfpacc_lo = vcvtq_f32_s32(vacc_lo); float32x4_t vfpacc_hi = vcvtq_f32_s32(vacc_hi); vfpacc_lo = vmulq_f32(vfpacc_lo, vscale); vfpacc_hi = vmulq_f32(vfpacc_hi, vscale); vacc_lo = vreinterpretq_s32_f32(vaddq_f32(vfpacc_lo, vmagic_bias)); vacc_hi = vreinterpretq_s32_f32(vaddq_f32(vfpacc_hi, vmagic_bias)); vacc_lo = vqsubq_s32(vacc_lo, vmagic_bias_less_output_zero_point); vacc_hi = vqsubq_s32(vacc_hi, vmagic_bias_less_output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc = vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi); #else int16x8_t vacc = vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)); #endif uint8x8_t vout = vqmovun_s16(vacc); vout = vmax_u8(vout, voutput_min); vout = vmin_u8(vout, voutput_max); if (c & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout), 0); output += 4; vout = vext_u8(vout, vout, 4); } if (c & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout), 0); output += 2; vout = vext_u8(vout, vout, 2); } if (c & 1) { vst1_lane_u8(output, vout, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_qu8_dwconv_minmax_rndnu_ukernel_25p8c__neon_mul8( size_t channels, size_t output_width, const uint8_t** 
input,
    const void* weights,
    uint8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const uint16x8_t vkernel_zero_point = vmovl_u8(vld1_dup_u8(params->rndnu_neon.kernel_zero_point));
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    const uint8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    const uint8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    const uint8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    const uint8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    const uint8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    const uint8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    const uint8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    const uint8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }
    const uint8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
    }
    const uint8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
    }
    const uint8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
    }
    const uint8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
    }
    const uint8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
    }
    const uint8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
    }
    const uint8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
    }
    const uint8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
    }
    const uint8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
    }
    const uint8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 =
(const uint8_t*) ((uintptr_t) i18 + input_offset); } const uint8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset); } const uint8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset); } const uint8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset); } const uint8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset); } const uint8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset); } const uint8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset); } input = (const uint8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = 
vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8; const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8; const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8; const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8; const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8; const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8; const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const 
int8_t*) w + 8; vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8; const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8; const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8; const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8; const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8; const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8; const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8; const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 
+= 8; const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8; const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8; const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8; const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point))); vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point))); vacc0123 = vshlq_s32(vacc0123, vright_pre_shift); vacc4567 = vshlq_s32(vacc4567, vright_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vright_post_shift); vacc4567 = vrshlq_s32(vacc4567, vright_post_shift); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); #else const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); #endif vout01234567 = vmax_u8(vout01234567, voutput_min); vout01234567 = vmin_u8(vout01234567, voutput_max); vst1_u8(output, vout01234567); output += 8; } if XNN_UNLIKELY(c != 0) { { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); const uint8x8_t vk0x01234567 = vld1_u8(w); uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); const uint8x8_t vk1x01234567 = vld1_u8((const uint8_t*) w + 8); vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t 
vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); const uint8x8_t vk2x01234567 = vld1_u8((const uint8_t*) w + 16); vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); const uint8x8_t vk3x01234567 = vld1_u8((const uint8_t*) w + 24); vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); const uint8x8_t vk4x01234567 = vld1_u8((const uint8_t*) w + 32); vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); const uint8x8_t vk5x01234567 = vld1_u8((const uint8_t*) w + 40); vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); const uint8x8_t vk6x01234567 = vld1_u8((const uint8_t*) w + 48); vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); const uint8x8_t vk7x01234567 = vld1_u8((const uint8_t*) w + 56); vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); const uint8x8_t vk8x01234567 = vld1_u8((const uint8_t*) w + 64); vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi9x01234567 = vld1_u8(i9); const uint8x8_t vk9x01234567 = vld1_u8((const uint8_t*) w + 72); vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567); vacc0123 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi10x01234567 = vld1_u8(i10); const uint8x8_t vk10x01234567 = vld1_u8((const uint8_t*) w + 80); vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi11x01234567 = vld1_u8(i11); const uint8x8_t vk11x01234567 = vld1_u8((const uint8_t*) w + 88); vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi12x01234567 = vld1_u8(i12); const uint8x8_t vk12x01234567 = vld1_u8((const uint8_t*) w + 96); vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi13x01234567 = vld1_u8(i13); const uint8x8_t vk13x01234567 = vld1_u8((const uint8_t*) w + 104); vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi14x01234567 = vld1_u8(i14); const uint8x8_t vk14x01234567 = vld1_u8((const uint8_t*) w + 112); vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi15x01234567 = vld1_u8(i15); const uint8x8_t vk15x01234567 = vld1_u8((const uint8_t*) w + 120); vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi16x01234567 = vld1_u8(i16); const uint8x8_t vk16x01234567 = vld1_u8((const uint8_t*) w + 128); vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi17x01234567 = vld1_u8(i17); const uint8x8_t vk17x01234567 = vld1_u8((const uint8_t*) w + 136); vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567); vacc0123 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi18x01234567 = vld1_u8(i18); const uint8x8_t vk18x01234567 = vld1_u8((const uint8_t*) w + 144); vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi19x01234567 = vld1_u8(i19); const uint8x8_t vk19x01234567 = vld1_u8((const uint8_t*) w + 152); vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi20x01234567 = vld1_u8(i20); const uint8x8_t vk20x01234567 = vld1_u8((const uint8_t*) w + 160); vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi21x01234567 = vld1_u8(i21); const uint8x8_t vk21x01234567 = vld1_u8((const uint8_t*) w + 168); vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi22x01234567 = vld1_u8(i22); const uint8x8_t vk22x01234567 = vld1_u8((const uint8_t*) w + 176); vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi23x01234567 = vld1_u8(i23); const uint8x8_t vk23x01234567 = vld1_u8((const uint8_t*) w + 184); vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi24x01234567 = vld1_u8(i24); const uint8x8_t vk24x01234567 = vld1_u8((const uint8_t*) w + 192); vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point))); vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point))); 
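
      // vsum01234567 is the plain sum of all 25 input taps. The two vmlsl_u16
      // statements above subtract kernel_zero_point * sum(inputs) from the
      // uint8*uint8 product accumulators, yielding the signed
      // sum(input * (kernel - kernel_zero_point)) that the rndnu
      // requantization below expects.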
      vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
      vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

#if XNN_ARCH_ARM64
      const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#else
      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#endif

      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (c & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (c & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (c & 1) {
        vst1_lane_u8(output, vout01234567, 0); output += 1;
      }
      }
    }

    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

// Depthwise QU8 convolution, 9 taps, 16 channels per main-loop iteration,
// with rndnu requantization.
void xnn_qu8_dwconv_minmax_rndnu_ukernel_9p16c__neon_mul8(
    size_t channels,
    size_t output_width,
    const uint8_t** input,
    const void* weights,
    uint8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  const uint16x8_t vkernel_zero_point = vmovl_u8(vld1_dup_u8(params->rndnu_neon.kernel_zero_point));
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
  const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);

  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    const uint8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    const uint8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    const uint8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    const uint8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    const uint8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    const uint8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    const uint8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    const uint8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }
    input = (const uint8_t**) ((uintptr_t) input + input_stride);
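
    // Main channel loop: 16 channels per iteration. Each of the 9 taps does a
    // uint8 vmull_u8 multiply into four int32 accumulators, while
    // vsum01234567/vsum89ABCDEF collect the raw input sums used for the
    // kernel zero-point correction after the last tap.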
size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vacc89AB = vld1q_s32(w); w = (const int32_t*) w + 4; int32x4_t vaccCDEF = vld1q_s32(w); w = (const int32_t*) w + 4; const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF); uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF))); vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const int8_t*) w + 8; const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8; vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF); vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); 
vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));

      const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
      const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const int8_t*) w + 8;
      const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
      const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const int8_t*) w + 8;

      vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
      vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));

      // Kernel zero-point correction: acc -= kzp * sum(inputs), turning the u8*u8 products
      // into the signed (k - kzp) * i form.
      vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point)));
      vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point)));
      vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point)));
      vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point)));

      // rndnu requantization: the pre-shift is the saturating vqshlq_s32 (matching every
      // other rndnu kernel in this file), then a doubling high multiply, then a rounding
      // post-shift.
      vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);
      vacc89AB = vqshlq_s32(vacc89AB, vright_pre_shift);
      vaccCDEF = vqshlq_s32(vaccCDEF, vright_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
      vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
      vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
      vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);

#if XNN_ARCH_ARM64
      const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
      const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
      uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
#else
      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
      const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
      uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
#endif

      vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
      vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);

      vst1q_u8(output, vout0123456789ABCDEF); output += 16;
    }
    if XNN_UNLIKELY(c != 0) {
      const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
      do {
        int32x4_t vacc0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
        int32x4_t vacc4567 = vld1q_s32(w); w = (const int32_t*) w + 4;

        const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
        const uint8x8_t
vk0x01234567 = vld1_u8(k); k += 8; uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vk1x01234567 = vld1_u8((const void*) (k + 8)); vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567); uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vk2x01234567 = vld1_u8((const void*) (k + 24)); vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vk3x01234567 = vld1_u8((const void*) (k + 40)); vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; const uint8x8_t vk4x01234567 = vld1_u8((const void*) (k + 56)); vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; const uint8x8_t vk5x01234567 = vld1_u8((const void*) (k + 72)); vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; const uint8x8_t vk6x01234567 = vld1_u8((const void*) (k + 88)); vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8; const uint8x8_t vk7x01234567 = vld1_u8((const void*) (k + 104)); vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567); vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567); vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567))); vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567))); const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8; const uint8x8_t vk8x01234567 = vld1_u8((const void*) (k + 120)); vprod01234567 = 
vmull_u8(vi8x01234567, vk8x01234567);
        vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
        vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
        vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));

        vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point)));
        vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point)));

        // Same rndnu sequence as the 16-channel loop: the pre-shift must be the saturating
        // vqshlq_s32; the rounding vrshlq_s32 belongs to the post-shift only.
        vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
        vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);
        vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
        vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
        vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
        vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

#if XNN_ARCH_ARM64
        const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
        uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#else
        const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
        uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
#endif

        vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
        vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));

        if XNN_LIKELY(c >= 8) {
          vst1_u8(output, vout01234567); output += 8;
          c -= 8;
        } else {
          if (c & 4) {
            vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
            vout01234567 = vext_u8(vout01234567, vout01234567, 4);
          }
          if (c & 2) {
            vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
            vout01234567 = vext_u8(vout01234567, vout01234567, 2);
          }
          if (c & 1) {
            vst1_lane_u8(output, vout01234567, 0); output += 1;
          }
          c = 0;
        }
      } while (c != 0);
    }
    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
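/* The next kernel dequantizes qu8 to f32: y = scale * ((int32_t) x - zero_point).
 * The params carry a precomputed minus_zero_point, so the subtraction is done as a
 * widening add (vaddw_u8) into i16, followed by widening to i32, conversion to f32,
 * and a multiply by scale. */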
void xnn_qu8_f32_vcvt_ukernel__neon_x32(
    size_t batch,
    const uint8_t* input,
    float* output,
    const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint8_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const int16x8_t vminus_zero_point = vreinterpretq_s16_u32(vld1q_dup_u32((const void*) params->neon.minus_zero_point));
  const float32x4_t vscale = vld1q_dup_f32(&params->neon.scale);
  for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) {
    const uint8x8_t vx01234567 = vld1_u8(input); input += 8;
    const uint8x8_t vx89ABCDEF = vld1_u8(input); input += 8;
    const uint8x8_t vxGHIJKLMN = vld1_u8(input); input += 8;
    const uint8x8_t vxOPQRSTUV = vld1_u8(input); input += 8;

    const int16x8_t vhx01234567 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx01234567));
    const int16x8_t vhx89ABCDEF = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx89ABCDEF));
    const int16x8_t vhxGHIJKLMN = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vxGHIJKLMN));
    const int16x8_t vhxOPQRSTUV = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vxOPQRSTUV));

    const int32x4_t vwx0123 = vmovl_s16(vget_low_s16(vhx01234567));
    const int32x4_t vwx4567 = vmovl_s16(vget_high_s16(vhx01234567));
    const int32x4_t vwx89AB = vmovl_s16(vget_low_s16(vhx89ABCDEF));
    const int32x4_t vwxCDEF = vmovl_s16(vget_high_s16(vhx89ABCDEF));
    const int32x4_t vwxGHIJ = vmovl_s16(vget_low_s16(vhxGHIJKLMN));
    const int32x4_t vwxKLMN = vmovl_s16(vget_high_s16(vhxGHIJKLMN));
    const int32x4_t vwxOPQR = vmovl_s16(vget_low_s16(vhxOPQRSTUV));
    const int32x4_t vwxSTUV = vmovl_s16(vget_high_s16(vhxOPQRSTUV));

    float32x4_t vy0123 = vcvtq_f32_s32(vwx0123);
    float32x4_t vy4567 = vcvtq_f32_s32(vwx4567);
    float32x4_t vy89AB = vcvtq_f32_s32(vwx89AB);
    float32x4_t vyCDEF = vcvtq_f32_s32(vwxCDEF);
    float32x4_t vyGHIJ = vcvtq_f32_s32(vwxGHIJ);
    float32x4_t vyKLMN = vcvtq_f32_s32(vwxKLMN);
    float32x4_t vyOPQR = vcvtq_f32_s32(vwxOPQR);
    float32x4_t vySTUV = vcvtq_f32_s32(vwxSTUV);

    vy0123 = vmulq_f32(vy0123, vscale);
    vy4567 = vmulq_f32(vy4567, vscale);
    vy89AB = vmulq_f32(vy89AB, vscale);
    vyCDEF = vmulq_f32(vyCDEF, vscale);
    vyGHIJ = vmulq_f32(vyGHIJ, vscale);
    vyKLMN = vmulq_f32(vyKLMN, vscale);
    vyOPQR = vmulq_f32(vyOPQR, vscale);
    vySTUV = vmulq_f32(vySTUV, vscale);

    vst1q_f32(output, vy0123); output += 4;
    vst1q_f32(output, vy4567); output += 4;
    vst1q_f32(output, vy89AB); output += 4;
    vst1q_f32(output, vyCDEF); output += 4;
    vst1q_f32(output, vyGHIJ); output += 4;
    vst1q_f32(output, vyKLMN); output += 4;
    vst1q_f32(output, vyOPQR); output += 4;
    vst1q_f32(output, vySTUV); output += 4;
  }
  for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) {
    const uint8x8_t vx = vld1_u8(input); input += 8;
    const int16x8_t vhx = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx));
    const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx));
    const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx));
    float32x4_t vy_lo = vcvtq_f32_s32(vwx_lo);
    float32x4_t vy_hi = vcvtq_f32_s32(vwx_hi);
    vy_lo = vmulq_f32(vy_lo, vscale);
    vy_hi = vmulq_f32(vy_hi, vscale);
    vst1q_f32(output, vy_lo); output += 4;
    vst1q_f32(output, vy_hi); output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch >= 1 * sizeof(uint8_t));
    assert(batch <= 7 * sizeof(uint8_t));
    const uint8x8_t vx = vld1_u8(input);
    const int16x8_t vhx = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vminus_zero_point), vx));
    const int32x4_t vwx_lo = vmovl_s16(vget_low_s16(vhx));
    const int32x4_t vwx_hi = vmovl_s16(vget_high_s16(vhx));
    float32x4_t vy = vcvtq_f32_s32(vwx_lo);
    vy = vmulq_f32(vy, vscale);
    if (batch & (4 * sizeof(uint8_t))) {
      vst1q_f32(output, vy); output += 4;
      vy = vcvtq_f32_s32(vwx_hi);
      vy = vmulq_f32(vy, vscale);
    }
    float32x2_t vy_lo = vget_low_f32(vy);
    if (batch & (2 * sizeof(uint8_t))) {
      vst1_f32(output, vy_lo); output += 2;
      vy_lo = vget_high_f32(vy);
    }
    if (batch & (1 * sizeof(uint8_t))) {
      vst1_lane_f32(output, vy_lo, 0);
    }
  }
}
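/* Global average pooling over more than 7 rows happens in passes: the first 7 rows
 * are summed into the i32 buffer (seeded with init_bias), each further group of 7
 * rows is accumulated into that buffer, and the final pass of at most 7 rows (with
 * missing rows redirected to the zero vector) requantizes and stores. */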
void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c != 0; c = doz(c, 8)) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

    vst1q_s32(b, vacc0123); b += 4;
    vst1q_s32(b, vacc4567); b += 4;
  }
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c != 0; c = doz(c, 8)) {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
    }
  }

  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; }
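  /* rndnu requantization parameters for the final pass: the accumulator is scaled by
   * roughly 2^pre_shift * (multiplier / 2^31) * 2^post_shift, which effectively divides
   * by the row count in fixed point. The "left" shift amounts act as right shifts when
   * negative, since vqshlq_s32/vrshlq_s32 shift right for negative counts. */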
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
  for (; channels >= 8; channels -= 8) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
    vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else  // !XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

    uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);

    vout01234567 = vmax_u8(vout01234567, voutput_min);
    vout01234567 = vmin_u8(vout01234567, voutput_max);

    vst1_u8(output, vout01234567); output += 8;
  }
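  /* 1-7 leftover channels: the loads still read a full 8 bytes (the kernel is declared
   * XNN_OOB_READS, so reading past the end of the row is permitted) and only the valid
   * lanes are written via the channels & 4/2/1 cascade. */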
  if XNN_UNLIKELY(channels != 0) {
    {
      const uint8x8_t vi0x01234567 = vld1_u8(i0);
      const uint8x8_t vi1x01234567 = vld1_u8(i1);
      const uint8x8_t vi2x01234567 = vld1_u8(i2);
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3);
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4);
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5);
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6);
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);

      int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
      int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (channels & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (channels & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (channels & 1) {
        vst1_lane_u8(output, vout01234567, 0);
      }
    }
  }
}
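/* Single-pass variant for at most 7 rows: no intermediate buffer is needed; pointers
 * past the last row are redirected to the zero vector so the 7-way sum is unaffected. */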
void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows != 0);
  assert(rows <= 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; }
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; }
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; }
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; }
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; }
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; }

  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
  const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
  for (; channels >= 8; channels -= 8) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

    int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else  // !XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

    uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);

    vout01234567 = vmax_u8(vout01234567, voutput_min);
    vout01234567 = vmin_u8(vout01234567, voutput_max);

    vst1_u8(output, vout01234567); output += 8;
  }
  if XNN_UNLIKELY(channels != 0) {
    {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
      int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, voutput_min);
      vout01234567 = vmin_u8(vout01234567, voutput_max);

      if (channels & 4) {
        vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
      }
      if (channels & 2) {
        vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
      }
      if (channels & 1) {
        vst1_lane_u8(output, vout01234567, 0);
      }
    }
  }
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
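    /* Inner GEMM loop: 8 activations are widened to i16 (vmovl_u8), each weight row has
     * the kernel zero point subtracted while widening (vsubl_u8), and vmlal_lane_s16
     * broadcasts one activation lane per k step into the four i32 accumulator quads
     * covering the 16 output channels. */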
size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 
8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 
8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = 
vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point));

                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
    }

    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);

    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));

    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
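    /* Output store: a full 16-byte vector when nc >= 16; otherwise progressively
     * narrower lane stores of 8/4/2/1 bytes, with vext_u8 rotating the already
     * stored bytes out of the vector. */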
    if (nc >= 16) {
      vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      nc -= 16;
    } else {
      uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_u8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t k = kc;
    while (k >= 8 * sizeof(uint8_t)) {
      const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
      const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));

      const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
      const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
      const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
      const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
      const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
      const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
      vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
      vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
      const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
      const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); } } } } } } } const int32x4_t 
vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#endif
    const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
    const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_u8(c0 + 0, vout0x01234567);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;
  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;

    size_t k = kc;
    while (k >= 8 * sizeof(uint8_t)) {
      const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
      const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
      const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
      const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
      const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
      const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));

      const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*)
((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), 
vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, 
vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 
1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); } } } } } } }
// Requantize: saturating left pre-shift, Q31 fixed-point multiply, rounding right post-shift (rndnu scheme).
const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567); uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567,
voutput_zero_point); uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567)); uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#endif
const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min); vout2x01234567 = vmax_u8(vout2x01234567, vget_low_u8(voutput_min)); const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max); vout2x01234567 = vmin_u8(vout2x01234567, vget_low_u8(voutput_max)); if (nc >= 8) { vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567)); vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567)); vst1_u8(c2 + 0, vout2x01234567); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); a0 = (const uint8_t*) ((uintptr_t) a0 - kc); a1 = (const uint8_t*) ((uintptr_t) a1 - kc); a2 = (const uint8_t*) ((uintptr_t) a2 - kc); nc -= 8; } else { if (nc & 4) { vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1_lane_u32((void*) c2, vreinterpret_u32_u8(vout2x01234567), 0); c2 += 4; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1_lane_u16((void*) c2, vreinterpret_u16_u8(vout2x01234567), 0); c2 += 2; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1_lane_u8(c2, vout2x01234567, 0); } nc = 0; } } while (nc != 0); }
// QU8 GEMM microkernel, 4x16 tile (four rows of A, sixteen columns of B), same mlal-lane structure as the kernel above.
void xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, const uint8_t* restrict a, size_t a_stride, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const uint8_t* a0 = a; uint8_t* c0 = c; const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride); uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride); uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride); uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*)
w + 4); int32x4_t vacc1x0123 = vacc0x0123; int32x4_t vacc1x4567 = vacc0x4567; int32x4_t vacc1x89AB = vacc0x89AB; int32x4_t vacc1xCDEF = vacc0xCDEF; int32x4_t vacc2x0123 = vacc0x0123; int32x4_t vacc2x4567 = vacc0x4567; int32x4_t vacc2x89AB = vacc0x89AB; int32x4_t vacc2xCDEF = vacc0xCDEF; int32x4_t vacc3x0123 = vacc0x0123; int32x4_t vacc3x4567 = vacc0x4567; int32x4_t vacc3x89AB = vacc0x89AB; int32x4_t vacc3xCDEF = vacc0xCDEF; size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 += 8; const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 += 8; const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 += 8; const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, 
vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, 
vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, 
vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = 
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k); const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, 
vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); 
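// Tap c2 continues below: row 1 columns CDEF, then rows 2 and 3.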
vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, 
vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, 
vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); } } } } } } } const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift); vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift); vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift); vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift); vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift); vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift); vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift); vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier); vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier); vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier); vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier); vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier); vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier); vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier); vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift); vacc0xCDEF = vrshlq_s32(vacc0xCDEF,
vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift); vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift); vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift); vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift); vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift); vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift); vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF); int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567); int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF); uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF); uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF); uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF);
#else
int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)); int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)); int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF =
vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF)); uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF)); uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF)); uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF));
#endif
const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min); vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min); vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min); vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min); const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max); vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max); vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max); vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max); if (nc >= 16) { vst1q_u8(c0 + 0, vout0x0123456789ABCDEF); vst1q_u8(c1 + 0, vout1x0123456789ABCDEF); vst1q_u8(c2 + 0, vout2x0123456789ABCDEF); vst1q_u8(c3 + 0, vout3x0123456789ABCDEF); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride); a0 = (const uint8_t*) ((uintptr_t) a0 - kc); a1 = (const uint8_t*) ((uintptr_t) a1 - kc); a2 = (const uint8_t*) ((uintptr_t) a2 - kc); a3 = (const uint8_t*) ((uintptr_t) a3 - kc); nc -= 16; } else { uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF)); uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF)); if (nc & 8) { vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8; vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8; vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8; vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8; vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF)); vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF)); } if (nc & 4) { vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4; vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2; vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2; vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
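// Rotate the bytes just stored out of each output vector so the next, narrower store reads from lane 0.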
vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0); vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8); } nc = 0; } } while (nc != 0); }
// QU8 IGEMM (indirect GEMM) microkernel, 1x16 tile: A-row pointers are gathered from the indirection array `a`; entries equal to `zero` point at the padding buffer and skip the a_offset adjustment.
void xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane( size_t mr, size_t nc, size_t kc, size_t ks, const uint8_t** restrict a, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const uint8_t* zero, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); uint8_t* c0 = c; const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]); do { int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const uint8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 8 * sizeof(uint8_t)) { const uint8x8_t va0 = vld1_u8(a0); a0 += 8; const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const
uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = 
vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, 
            vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
          vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);

          if (k > 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
            const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
            vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
            vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);

            if (k >= 6 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
              const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
              vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
              vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);

              if (k > 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
                vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
              }
            }
          }
        }
      }
    }
  }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work: requantize the int32 accumulators with the rndnu
    // scheme. (Explanatory note, not upstream commentary: conceptually this is
    //   acc = rounding_shift(q31_doubling_mulh(saturating_shift(acc, pre), multiplier), post),
    // with both shift amounts stored as negative left shifts; the output zero
    // point is then added and the result saturates into [output_min, output_max].)
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
    uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);

    if (nc >= 16) {
      vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
      nc -= 16;
    } else {
      uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
      if (nc & 8) {
        vst1_u8(c0, vout0x01234567); c0 += 8;
        vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
      }
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
        vacc0x0123 =
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, 
            vb_zero_point));
          vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
          vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);

          if (k >= 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);

            if (k > 4 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);

              if (k >= 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                }
              }
            }
          }
        }
      }
      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
#endif
    const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
    const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);

    if (nc >= 8) {
      vst1_u8(c0 + 0, vout0x01234567);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
        vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c0, vout0x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (3 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      a += 3;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
        vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
        const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
        const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
        vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
        vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
        vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
        vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1),
vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), 
vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = 
          vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
          vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
          vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);

          if (k >= 4 * sizeof(uint8_t)) {
            const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
            const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
            vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
            vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
            vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
            vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
            vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);

            if (k > 4 * sizeof(uint8_t)) {
              const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
              const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
              vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
              vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
              vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
              vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
              vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);

              if (k >= 6 * sizeof(uint8_t)) {
                const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
                vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
                vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
                vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
                vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
                vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);

                if (k > 6 * sizeof(uint8_t)) {
                  const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
                  const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
                  vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
                  vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
                  vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
                  vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
                  vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
                }
              }
            }
          }
        }
      }
      p -= 3 * sizeof(void*);
    } while (p != 0);

    // Post-accumulation work
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
    vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
    vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
    vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
    vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
    vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
    vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);

    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
    int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
    int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
    uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
    uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#else
    int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
    int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
    int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
    vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
    vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
    vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
    uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
    uint8x8_t vout2x01234567 = vqmovun_s16(vacc2x01234567);
#endif
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
    vout2x01234567 = vmax_u8(vout2x01234567, vget_low_u8(voutput_min));
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
    vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
    vout2x01234567 = vmin_u8(vout2x01234567, vget_low_u8(voutput_max));

    if (nc >= 8) {
      vst1_u8(c2 + 0, vout2x01234567);
      vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567));
      vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567));

      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);

      a = (const uint8_t**restrict) ((uintptr_t) a - ks);

      nc -= 8;
    } else {
      if (nc & 4) {
        vst1_lane_u32((void*) c2, vreinterpret_u32_u8(vout2x01234567), 0); c2 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
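        // Note (editorial, not from the upstream source): after each partial
        // store, the output vectors are rotated with vext so the bytes that are
        // still unwritten move down to lane 0 for the next, narrower store
        // (4 bytes, then 2, then 1).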
        vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 4);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1_lane_u16((void*) c2, vreinterpret_u16_u8(vout2x01234567), 0); c2 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
        vout2x01234567 = vext_u8(vout2x01234567, vout2x01234567, 2);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1_lane_u8(c2, vout2x01234567, 0);
        vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
        vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (4 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    c3 = c2;
  }

  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);
  do {
    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
    int32x4_t vacc1x0123 = vacc0x0123;
    int32x4_t vacc1x4567 = vacc0x4567;
    int32x4_t vacc1x89AB = vacc0x89AB;
    int32x4_t vacc1xCDEF = vacc0xCDEF;
    int32x4_t vacc2x0123 = vacc0x0123;
    int32x4_t vacc2x4567 = vacc0x4567;
    int32x4_t vacc2x89AB = vacc0x89AB;
    int32x4_t vacc2xCDEF = vacc0xCDEF;
    int32x4_t vacc3x0123 = vacc0x0123;
    int32x4_t vacc3x4567 = vacc0x4567;
    int32x4_t vacc3x89AB = vacc0x89AB;
    int32x4_t vacc3xCDEF = vacc0xCDEF;

    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      const uint8_t* restrict a3 = a[3];
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
      }
      a += 4;

      size_t k = kc;
      while (k >= 8 * sizeof(uint8_t)) {
        const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
        const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
        const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
        const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
        const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
        const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
        const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
        const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));

        const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
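        // Layout note (an inference from the access pattern here, not upstream
        // documentation): after the four int32x4 bias vectors consumed at the top
        // of the nc loop, w streams 8-byte groups of quantized weights that
        // alternate output columns 0-7 and 8-15 for each of the eight k positions
        // c0..c7 handled per iteration.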
const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = 
vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = 
vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF 
= vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3); const uint8x8_t vb89ABCDEFc7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc7 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc7, vb_zero_point)); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); 
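        // How each update works (explanatory comment, not from the upstream
        // source): vmlal_lane_s16 multiplies eight zero-point-adjusted weights by
        // a single broadcast activation lane and widens into the int32
        // accumulators; the next statement, for example, computes
        //   vacc0xCDEF[i] += (int32_t) vxb89ABCDEFc7[4 + i] * (int32_t) vxa0[7].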
vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); k -= 8 * sizeof(uint8_t); } if XNN_UNLIKELY(k != 0) { const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k); const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0)); const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k); const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1)); const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k); const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2)); const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k); const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point)); const uint8x8_t vb89ABCDEFc0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); if (k >= 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point)); const uint8x8_t vb89ABCDEFc1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); vacc0x0123 = 
vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); if (k > 2 * sizeof(uint8_t)) { const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point)); const uint8x8_t vb89ABCDEFc2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc2 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc2, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); if (k >= 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c3 = 
vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); const uint8x8_t vb89ABCDEFc3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc3 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc3, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); if (k > 4 * sizeof(uint8_t)) { const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point)); const uint8x8_t vb89ABCDEFc4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc4 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc4, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); 
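            // Remainder handling (editorial note, not from the upstream source):
            // the tail still loads full 8-byte vectors -- the kernel is declared
            // XNN_OOB_READS, so reading a little past the valid k bytes is
            // permitted -- and the nested k tests merely skip the
            // multiply-accumulates for k positions beyond the true remainder.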
vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); if (k >= 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point)); const uint8x8_t vb89ABCDEFc5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc5 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc5, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1); vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); if (k > 6 * sizeof(uint8_t)) { const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point)); const uint8x8_t vb89ABCDEFc6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8); const int16x8_t vxb89ABCDEFc6 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc6, vb_zero_point)); vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2); vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2); vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2); vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2); vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2); vacc3x0123 = 
vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2); vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); } } } } } } } p -= 4 * sizeof(void*); } while (p != 0); // Post-accumulation work const int32x4_t vright_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vright_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_post_shift); vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift); vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift); vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift); vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift); vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift); vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift); vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift); vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift); vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift); vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift); vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift); vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift); vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift); vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift); vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift); vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift); vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier); vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier); vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier); vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier); vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier); vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier); vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier); vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier); vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier); vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier); vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier); vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier); vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier); vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier); vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier); vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier); vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift); vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift); vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift); vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift); vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift); vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift); vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift); vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift); vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift); vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift); vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift); vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift); vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift); vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift); vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift); vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); #if XNN_ARCH_ARM64 int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567); int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), 
vacc0xCDEF); int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567); int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF); int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567); int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF); int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567); int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF); uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF); uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF); uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF); #else int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)); int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)); int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)); int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)); int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)); int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)); int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)); int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)); vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point); vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point); vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point); vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point); vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point); vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point); vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point); vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point); uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF)); uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF)); uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF)); uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF)); #endif const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min); vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min); vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min); vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); vout0x0123456789ABCDEF = 
vminq_u8(vout0x0123456789ABCDEF, voutput_max); vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max); vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max); vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max); if (nc >= 16) { vst1q_u8(c3 + 0, vout3x0123456789ABCDEF); vst1q_u8(c2 + 0, vout2x0123456789ABCDEF); vst1q_u8(c1 + 0, vout1x0123456789ABCDEF); vst1q_u8(c0 + 0, vout0x0123456789ABCDEF); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); a = (const uint8_t**restrict) ((uintptr_t) a - ks); nc -= 16; } else { uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF)); uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF)); if (nc & 8) { vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8; vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8; vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8; vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8; vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF)); vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF)); } if (nc & 4) { vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4; vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4; vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4; vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4; vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4); vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4); } if (nc & 2) { vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2; vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2; vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2; vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2; vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2); vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2); } if (nc & 1) { vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8); vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0); vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8); vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0); } nc = 0; } } while (nc != 0); } void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(¶ms->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(¶ms->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t 
voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, 
vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(¶ms->neon.b_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vb_multiplier = vld1q_dup_s32(¶ms->neon.b_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const uint8x8_t vaGHIJKLMN = vld1_u8(input_a); input_a += 8; const uint8x8_t vbGHIJKLMN = vld1_u8(input_b); input_b += 8; const uint8x8_t vaOPQRSTUV = vld1_u8(input_a); input_a += 8; const uint8x8_t vbOPQRSTUV = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); const int16x8_t vxaGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vaGHIJKLMN, va_zero_point)); const int16x8_t vxbGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vbGHIJKLMN, vb_zero_point)); const int16x8_t vxaOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vaOPQRSTUV, va_zero_point)); const int16x8_t vxbOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vbOPQRSTUV, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmulq_s32(vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmulq_s32(vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmulq_s32(vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmulq_s32(vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier); vaccCDEF = 
vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier); vaccGHIJ = vmlaq_s32(vaccGHIJ, vmovl_s16(vget_low_s16(vxbGHIJKLMN)), vb_multiplier); vaccKLMN = vmlaq_s32(vaccKLMN, vmovl_s16(vget_high_s16(vxbGHIJKLMN)), vb_multiplier); vaccOPQR = vmlaq_s32(vaccOPQR, vmovl_s16(vget_low_s16(vxbOPQRSTUV)), vb_multiplier); vaccSTUV = vmlaq_s32(vaccSTUV, vmovl_s16(vget_high_s16(vxbOPQRSTUV)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier); vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void 
xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, 
vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(¶ms->neon.a_zero_point); const int32x4_t va_multiplier = vld1q_dup_s32(¶ms->neon.a_multiplier); const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.output_max); const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point; const int32_t vb = params->neon.b_multiplier; const int32x4_t vbias = vdupq_n_s32(vxb * vb); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vaGHIJKLMN = vld1_u8(input_a); input_a += 8; const uint8x8_t vaOPQRSTUV = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxaGHIJKLMN = vreinterpretq_s16_u16(vsubl_u8(vaGHIJKLMN, va_zero_point)); const int16x8_t vxaOPQRSTUV = vreinterpretq_s16_u16(vsubl_u8(vaOPQRSTUV, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier); int32x4_t vaccGHIJ = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccKLMN = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier); int32x4_t vaccOPQR = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier); int32x4_t vaccSTUV = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); vacc89AB = vrshlq_s32(vacc89AB, vright_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift); vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift); vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift); vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift); vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point); const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point); const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point); uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); 
uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier); int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier); vacc0123 = vrshlq_s32(vacc0123, vright_shift); vacc4567 = vrshlq_s32(vacc4567, vright_shift); const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vcvt_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vmultiplier = vld1q_dup_s16(¶ms->neon.multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vacc0), vqmovun_s16(vacc1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vacc2), vqmovun_s16(vacc3)); vst1q_u8(output, vy0); output 
+= 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vacc); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x16( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vout0), vqmovun_s16(vout1)); vst1q_u8(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = 
vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); int16x8_t vin0 = vqdmulhq_s16(vacc0, vinput_scale_div_mantissa); int16x8_t vin1 = vqdmulhq_s16(vacc1, vinput_scale_div_mantissa); int16x8_t vin2 = vqdmulhq_s16(vacc2, vinput_scale_div_mantissa); int16x8_t vin3 = vqdmulhq_s16(vacc3, vinput_scale_div_mantissa); vin0 = vqshlq_s16(vin0, vinput_scale_div_exp); vin1 = vqshlq_s16(vin1, vinput_scale_div_exp); vin2 = vqshlq_s16(vin2, vinput_scale_div_exp); vin3 = vqshlq_s16(vin3, vinput_scale_div_exp); vin0 = vqsubq_s16(vin0, vhalf); vin1 = vqsubq_s16(vin1, vhalf); vin2 = vqsubq_s16(vin2, vhalf); vin3 = vqsubq_s16(vin3, vhalf); vin0 = vminq_s16(vin0, vzero); vin1 = vminq_s16(vin1, vzero); vin2 = vminq_s16(vin2, vzero); vin3 = vminq_s16(vin3, vzero); int16x8_t vout0 = vqdmulhq_s16(vacc0, vscale_ratio); int16x8_t vout1 = vqdmulhq_s16(vacc1, vscale_ratio); int16x8_t vout2 = vqdmulhq_s16(vacc2, vscale_ratio); int16x8_t vout3 = vqdmulhq_s16(vacc3, vscale_ratio); vout0 = vqdmulhq_s16(vout0, vin0); vout1 = vqdmulhq_s16(vout1, vin1); vout2 = vqdmulhq_s16(vout2, vin2); vout3 = vqdmulhq_s16(vout3, vin3); vout0 = vqaddq_s16(vout0, voutput_zero_point); vout1 = vqaddq_s16(vout1, voutput_zero_point); 
vout2 = vqaddq_s16(vout2, voutput_zero_point); vout3 = vqaddq_s16(vout3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vout0), vqmovun_s16(vout1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vout2), vqmovun_s16(vout3)); vst1q_u8(output, vy0); output += 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vhswish_ukernel__neon_x8( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vinput_scale_div_exp = vld1q_dup_s16(¶ms->neon.input_scale_div_exp); const int16x8_t vinput_scale_div_mantissa = vld1q_dup_s16(¶ms->neon.input_scale_div_mantissa); const int16x8_t vscale_ratio = vld1q_dup_s16(¶ms->neon.scale_ratio); const int16x8_t vhalf = vdupq_n_s16(16384); const int16x8_t vzero = vdupq_n_s16(0); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vout); vst1_u8(output, vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); vacc = vshlq_n_s16(vacc, 7); int16x8_t vin = vqdmulhq_s16(vacc, vinput_scale_div_mantissa); vin = vqshlq_s16(vin, vinput_scale_div_exp); 
vin = vqsubq_s16(vin, vhalf); vin = vminq_s16(vin, vzero); int16x8_t vout = vqdmulhq_s16(vacc, vscale_ratio); vout = vqdmulhq_s16(vout, vin); vout = vqaddq_s16(vout, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vout); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vlrelu_ukernel__neon_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint16x8_t vinput_zero_point = vld1q_dup_u16(¶ms->neon.input_zero_point); const int16x8_t vpositive_multiplier = vld1q_dup_s16(¶ms->neon.positive_multiplier); const int16x8_t vnegative_multiplier = vld1q_dup_s16(¶ms->neon.negative_multiplier); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const uint8x16_t vx0 = vld1q_u8(input); input += 16; const uint8x16_t vx1 = vld1q_u8(input); input += 16; int16x8_t vacc0 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx0))); int16x8_t vacc1 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx0))); int16x8_t vacc2 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_low_u8(vx1))); int16x8_t vacc3 = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vget_high_u8(vx1))); const uint16x8_t vmask0 = vcltq_s16(vacc0, vmovq_n_s16(0)); const uint16x8_t vmask1 = vcltq_s16(vacc1, vmovq_n_s16(0)); const uint16x8_t vmask2 = vcltq_s16(vacc2, vmovq_n_s16(0)); const uint16x8_t vmask3 = vcltq_s16(vacc3, vmovq_n_s16(0)); vacc0 = vshlq_n_s16(vacc0, 7); vacc1 = vshlq_n_s16(vacc1, 7); vacc2 = vshlq_n_s16(vacc2, 7); vacc3 = vshlq_n_s16(vacc3, 7); const int16x8_t vmultiplier0 = vbslq_s16(vmask0, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier1 = vbslq_s16(vmask1, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier2 = vbslq_s16(vmask2, vpositive_multiplier, vnegative_multiplier); const int16x8_t vmultiplier3 = vbslq_s16(vmask3, vpositive_multiplier, vnegative_multiplier); vacc0 = vqrdmulhq_s16(vacc0, vmultiplier0); vacc1 = vqrdmulhq_s16(vacc1, vmultiplier1); vacc2 = vqrdmulhq_s16(vacc2, vmultiplier2); vacc3 = vqrdmulhq_s16(vacc3, vmultiplier3); vacc0 = vqaddq_s16(vacc0, voutput_zero_point); vacc1 = vqaddq_s16(vacc1, voutput_zero_point); vacc2 = vqaddq_s16(vacc2, voutput_zero_point); vacc3 = vqaddq_s16(vacc3, voutput_zero_point); const uint8x16_t vy0 = vcombine_u8(vqmovun_s16(vacc0), vqmovun_s16(vacc1)); const uint8x16_t vy1 = vcombine_u8(vqmovun_s16(vacc2), vqmovun_s16(vacc3)); vst1q_u8(output, vy0); output += 16; vst1q_u8(output, vy1); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const uint8x8_t vx = vld1_u8(input); input += 8; int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); const uint8x8_t vy = vqmovun_s16(vacc); vst1_u8(output, 
vy); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const uint8x8_t vx = vld1_u8(input); int16x8_t vacc = vreinterpretq_s16_u16(vsubw_u8(vinput_zero_point, vx)); const uint16x8_t vmask = vcltq_s16(vacc, vmovq_n_s16(0)); vacc = vshlq_n_s16(vacc, 7); const int16x8_t vmultiplier = vbslq_s16(vmask, vpositive_multiplier, vnegative_multiplier); vacc = vqrdmulhq_s16(vacc, vmultiplier); vacc = vqaddq_s16(vacc, voutput_zero_point); uint8x8_t vy = vqmovun_s16(vacc); if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vy), 0); output += 4; vy = vext_u8(vy, vy, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vy), 0); output += 2; vy = vext_u8(vy, vy, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vy, 0); } } } void xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(params->rndnu_neon.a_zero_point); const uint8x8_t vb_zero_point = vld1_dup_u8(params->rndnu_neon.b_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb89ABCDEF)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb89ABCDEF)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = 
vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); #if XNN_ARCH_ARM64 uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); #else uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); #endif vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb01234567)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb01234567)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const uint8x8_t va_zero_point = vld1_dup_u8(params->rndnu_neon.a_zero_point); const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); const uint8x16_t voutput_max = 
vld1q_dup_u8(¶ms->rndnu_neon.output_max); const uint8x8_t vb = vld1_dup_u8(input_b); const uint8x8_t vb_zero_point = vld1_dup_u8(params->rndnu_neon.b_zero_point); const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point)); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb)); int32x4_t vacc89AB = vmull_s16(vget_low_s16(vxa89ABCDEF), vget_low_s16(vxb)); int32x4_t vaccCDEF = vmull_s16(vget_high_s16(vxa89ABCDEF), vget_high_s16(vxb)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); #if XNN_ARCH_ARM64 uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); #else uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); #endif vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); vst1q_u8(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(batch != 0) { do { const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8; const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point)); int32x4_t vacc0123 = vmull_s16(vget_low_s16(vxa01234567), vget_low_s16(vxb)); int32x4_t vacc4567 = vmull_s16(vget_high_s16(vxa01234567), vget_high_s16(vxb)); vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); #if XNN_ARCH_ARM64 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); #else int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); #endif vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { vst1_u8(output, 
vout01234567); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; vout01234567 = vext_u8(vout01234567, vout01234567, 4); } if (batch & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; vout01234567 = vext_u8(vout01234567, vout01234567, 2); } if (batch & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vout01234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_s8_ibilinear_ukernel__neon_c16( size_t output_pixels, size_t channels, const int8_t** restrict input, size_t input_offset, const int16_t* restrict weights, int8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset); const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset); const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset); const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 16 * sizeof(int8_t); c -= 16 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int8x8_t vtl89ABCDEF = vld1_s8(i0); i0 += 8; const int8x8_t vtr89ABCDEF = vld1_s8(i1); i1 += 8; const int8x8_t vbl89ABCDEF = vld1_s8(i2); i2 += 8; const int8x8_t vbr89ABCDEF = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vtd89ABCDEF = vsubl_s8(vtr89ABCDEF, vtl89ABCDEF); const int16x8_t vbd89ABCDEF = vsubl_s8(vbr89ABCDEF, vbl89ABCDEF); const int16x8_t vdl89ABCDEF = vsubl_s8(vbl89ABCDEF, vtl89ABCDEF); const int16x8_t vxtl89ABCDEF = vmovl_s8(vtl89ABCDEF); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); const int16x8_t vdd89ABCDEF = vsubq_s16(vbd89ABCDEF, vtd89ABCDEF); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vtCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vtd89ABCDEF, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vdCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vdd89ABCDEF, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = 
vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), valphah); const int32x4_t vtCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vget_high_s16(vtd89ABCDEF), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), valphah); const int32x4_t vdCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vget_high_s16(vdd89ABCDEF), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); const int32x4_t vacc89AB = vmlaq_s32(vshlq_n_s32(vt89AB, 11), vd89AB, valphav); const int32x4_t vaccCDEF = vmlaq_s32(vshlq_n_s32(vtCDEF, 11), vdCDEF, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vuzp2q_s16(vreinterpretq_s16_s32(vacc89AB), vreinterpretq_s16_s32(vaccCDEF)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); const int16x8_t vacc89ABCDEF = vcombine_s16(vshrn_n_s32(vacc89AB, 16), vshrn_n_s32(vaccCDEF, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); const int8x8_t vo89ABCDEF = vrshrn_n_s16(vacc89ABCDEF, 6); vst1_s8(output, vo01234567); output += 8; vst1_s8(output, vo89ABCDEF); output += 8; } for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 
const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); vst1_s8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const int8x8_t vtl01234567 = vld1_s8(i0); const int8x8_t vtr01234567 = vld1_s8(i1); const int8x8_t vbl01234567 = vld1_s8(i2); const int8x8_t vbr01234567 = vld1_s8(i3); const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); if (c & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vo01234567), 0); output += 4; vo01234567 = vext_s8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vo01234567), 0); output += 2; vo01234567 = vext_s8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vo01234567, 0); output += 1; } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_s8_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const int8_t** restrict input, size_t input_offset, const int16_t* restrict weights, int8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset); const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset); const int8_t* i2 = (const int8_t*) ((uintptr_t) 
input[2] + input_offset); const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) { const int8x8_t vtl01234567 = vld1_s8(i0); i0 += 8; const int8x8_t vtr01234567 = vld1_s8(i1); i1 += 8; const int8x8_t vbl01234567 = vld1_s8(i2); i2 += 8; const int8x8_t vbr01234567 = vld1_s8(i3); i3 += 8; const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); vst1_s8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const int8x8_t vtl01234567 = vld1_s8(i0); const int8x8_t vtr01234567 = vld1_s8(i1); const int8x8_t vbl01234567 = vld1_s8(i2); const int8x8_t vbr01234567 = vld1_s8(i3); const int16x8_t vtd01234567 = vsubl_s8(vtr01234567, vtl01234567); const int16x8_t vbd01234567 = vsubl_s8(vbr01234567, vbl01234567); const int16x8_t vdl01234567 = vsubl_s8(vbl01234567, vtl01234567); const int16x8_t vxtl01234567 = vmovl_s8(vtl01234567); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const 
int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 int8x8_t vo01234567 = vrshrn_n_s16(vacc01234567, 6); if (c & (4 * sizeof(int8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vo01234567), 0); output += 4; vo01234567 = vext_s8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(int8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vo01234567), 0); output += 2; vo01234567 = vext_s8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(int8_t))) { vst1_lane_s8(output, vo01234567, 0); output += 1; } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const int8_t** input, size_t input_offset, int8_t* output, size_t input_increment, size_t output_increment, const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.max); const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.min); do { int8_t* o = output; { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; const int8_t* i8 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const int8x16_t vi0 = vld1q_s8(i0); i0 += 16; const int8x16_t vi1 = vld1q_s8(i1); i1 += 16; const int8x16_t vi2 = vld1q_s8(i2); i2 += 16; const int8x16_t vi3 = vld1q_s8(i3); i3 += 16; const int8x16_t vi4 = vld1q_s8(i4); i4 += 16; const int8x16_t vi5 = vld1q_s8(i5); i5 += 16; const int8x16_t vi6 = vld1q_s8(i6); i6 += 16; const int8x16_t vi7 = vld1q_s8(i7); i7 += 16; const int8x16_t vi8 = vld1q_s8(i8); i8 += 16; const int8x16_t 
vmax018 = vmaxq_s8(vmaxq_s8(vi0, vi1), vi8); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax01678 = vmaxq_s8(vmax018, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax01678); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); vst1q_s8(o, vout); o += 16; } if (c != 0) { const int8x16_t vi0 = vld1q_s8(i0); const int8x16_t vi1 = vld1q_s8(i1); const int8x16_t vi2 = vld1q_s8(i2); const int8x16_t vi3 = vld1q_s8(i3); const int8x16_t vi4 = vld1q_s8(i4); const int8x16_t vi5 = vld1q_s8(i5); const int8x16_t vi6 = vld1q_s8(i6); const int8x16_t vi7 = vld1q_s8(i7); const int8x16_t vi8 = vld1q_s8(i8); const int8x16_t vmax018 = vmaxq_s8(vmaxq_s8(vi0, vi1), vi8); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax01678 = vmaxq_s8(vmax018, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax01678); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); int8x8_t vout_lo = vget_low_s8(vout); if (c & 8) { vst1_s8(o, vout_lo); o += 8; vout_lo = vget_high_s8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_s8(vout_lo), 0); o += 4; vout_lo = vext_s8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_s8(vout_lo), 0); o += 2; vout_lo = vext_s8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_s8(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 16; c -= 16) { const int8x16_t vi0 = vld1q_s8(i0); i0 += 16; const int8x16_t vi1 = vld1q_s8(i1); i1 += 16; const int8x16_t vi2 = vld1q_s8(i2); i2 += 16; const int8x16_t vi3 = vld1q_s8(i3); i3 += 16; const int8x16_t vi4 = vld1q_s8(i4); i4 += 16; const int8x16_t vi5 = vld1q_s8(i5); i5 += 16; const int8x16_t vi6 = vld1q_s8(i6); i6 += 16; const int8x16_t vi7 = vld1q_s8(i7); i7 += 16; const int8x16_t vo = vld1q_s8(o); const int8x16_t vmax01 = vmaxq_s8(vmaxq_s8(vi0, vi1), vo); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax0167 = vmaxq_s8(vmax01, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax0167); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); vst1q_s8(o, vout); o += 16; } if (c != 0) { const int8x16_t vi0 = vld1q_s8(i0); const int8x16_t vi1 = vld1q_s8(i1); const int8x16_t vi2 = vld1q_s8(i2); const 
int8x16_t vi3 = vld1q_s8(i3); const int8x16_t vi4 = vld1q_s8(i4); const int8x16_t vi5 = vld1q_s8(i5); const int8x16_t vi6 = vld1q_s8(i6); const int8x16_t vi7 = vld1q_s8(i7); const int8x16_t vo = vld1q_s8(o); const int8x16_t vmax01 = vmaxq_s8(vmaxq_s8(vi0, vi1), vo); const int8x16_t vmax23 = vmaxq_s8(vi2, vi3); const int8x16_t vmax45 = vmaxq_s8(vi4, vi5); const int8x16_t vmax67 = vmaxq_s8(vi6, vi7); const int8x16_t vmax2345 = vmaxq_s8(vmax23, vmax45); const int8x16_t vmax0167 = vmaxq_s8(vmax01, vmax67); int8x16_t vout = vmaxq_s8(vmax2345, vmax0167); vout = vmaxq_s8(vout, voutput_min); vout = vminq_s8(vout, voutput_max); int8x8_t vout_lo = vget_low_s8(vout); if (c & 8) { vst1_s8(o, vout_lo); o += 8; vout_lo = vget_high_s8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_s8(vout_lo), 0); o += 4; vout_lo = vext_s8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_s8(vout_lo), 0); o += 2; vout_lo = vext_s8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_s8(o, vout_lo, 0); o += 1; } } } input = (const int8_t**) ((uintptr_t) input + input_increment); output = (int8_t*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_s8_vclamp_ukernel__neon_x64( size_t batch, const int8_t* input, int8_t* output, const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.max); const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.min); for (; batch >= 64; batch -= 64) { int8x16_t vacc0 = vld1q_s8(input); input += 16; int8x16_t vacc1 = vld1q_s8(input); input += 16; int8x16_t vacc2 = vld1q_s8(input); input += 16; int8x16_t vacc3 = vld1q_s8(input); input += 16; vacc0 = vmaxq_s8(vacc0, voutput_min); vacc1 = vmaxq_s8(vacc1, voutput_min); vacc2 = vmaxq_s8(vacc2, voutput_min); vacc3 = vmaxq_s8(vacc3, voutput_min); vacc0 = vminq_s8(vacc0, voutput_max); vacc1 = vminq_s8(vacc1, voutput_max); vacc2 = vminq_s8(vacc2, voutput_max); vacc3 = vminq_s8(vacc3, voutput_max); vst1q_s8(output, vacc0); output += 16; vst1q_s8(output, vacc1); output += 16; vst1q_s8(output, vacc2); output += 16; vst1q_s8(output, vacc3); output += 16; } for (; batch >= 8; batch -= 8) { int8x8_t vacc = vld1_s8(input); input += 8; vacc = vmin_s8(vacc, vget_low_s8(voutput_max)); vacc = vmax_s8(vacc, vget_low_s8(voutput_min)); vst1_s8(output, vacc); output += 8; } if XNN_UNLIKELY(batch != 0) { int8x8_t vacc = vld1_s8(input); input += 8; vacc = vmin_s8(vacc, vget_low_s8(voutput_max)); vacc = vmax_s8(vacc, vget_low_s8(voutput_min)); if (batch & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_s8(vacc), 0); output += 4; vacc = vext_s8(vacc, vacc, 4); } if (batch & 2) { vst1_lane_u16((void*) output, vreinterpret_u16_s8(vacc), 0); output += 2; vacc = vext_s8(vacc, vacc, 2); } if (batch & 1) { vst1_lane_s8(output, vacc, 0); } } } void xnn_u8_ibilinear_ukernel__neon_c16( size_t output_pixels, size_t channels, const uint8_t** restrict input, size_t input_offset, const int16_t* restrict weights, uint8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset); const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset); 
input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; const uint8x8_t vtl89ABCDEF = vld1_u8(i0); i0 += 8; const uint8x8_t vtr89ABCDEF = vld1_u8(i1); i1 += 8; const uint8x8_t vbl89ABCDEF = vld1_u8(i2); i2 += 8; const uint8x8_t vbr89ABCDEF = vld1_u8(i3); i3 += 8; const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vtd89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vtr89ABCDEF, vtl89ABCDEF)); const int16x8_t vbd89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vbr89ABCDEF, vbl89ABCDEF)); const int16x8_t vdl89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vbl89ABCDEF, vtl89ABCDEF)); const int16x8_t vxtl89ABCDEF = vreinterpretq_s16_u16(vmovl_u8(vtl89ABCDEF)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); const int16x8_t vdd89ABCDEF = vsubq_s16(vbd89ABCDEF, vtd89ABCDEF); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vtCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vtd89ABCDEF, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), vget_low_s16(valphah)); const int32x4_t vdCDEF = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vdd89ABCDEF, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vt89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl89ABCDEF), 11), vget_low_s16(vtd89ABCDEF), valphah); const int32x4_t vtCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl89ABCDEF), 11), vget_high_s16(vtd89ABCDEF), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); const int32x4_t vd89AB = vmlal_s16(vshll_n_s16(vget_low_s16(vdl89ABCDEF), 11), vget_low_s16(vdd89ABCDEF), valphah); const int32x4_t vdCDEF = vmlal_s16(vshll_n_s16(vget_high_s16(vdl89ABCDEF), 11), vget_high_s16(vdd89ABCDEF), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t 
vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); const int32x4_t vacc89AB = vmlaq_s32(vshlq_n_s32(vt89AB, 11), vd89AB, valphav); const int32x4_t vaccCDEF = vmlaq_s32(vshlq_n_s32(vtCDEF, 11), vdCDEF, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); const int16x8_t vacc89ABCDEF = vuzp2q_s16(vreinterpretq_s16_s32(vacc89AB), vreinterpretq_s16_s32(vaccCDEF)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); const int16x8_t vacc89ABCDEF = vcombine_s16(vshrn_n_s32(vacc89AB, 16), vshrn_n_s32(vaccCDEF, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); const uint8x8_t vo89ABCDEF = vrshrn_n_u16(vreinterpretq_u16_s16(vacc89ABCDEF), 6); vst1_u8(output, vo01234567); output += 8; vst1_u8(output, vo89ABCDEF); output += 8; } for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); vst1_u8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const uint8x8_t vtl01234567 = vld1_u8(i0); const uint8x8_t vtr01234567 = vld1_u8(i1); const uint8x8_t vbl01234567 = vld1_u8(i2); const uint8x8_t vbr01234567 = vld1_u8(i3); const int16x8_t vtd01234567 = 
vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vo01234567), 0); output += 4; vo01234567 = vext_u8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vo01234567), 0); output += 2; vo01234567 = vext_u8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vo01234567, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_u8_ibilinear_ukernel__neon_c8( size_t output_pixels, size_t channels, const uint8_t** restrict input, size_t input_offset, const int16_t* restrict weights, uint8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset); const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset); input += 4; #if XNN_ARCH_ARM64 const int16x8_t valphah = vld1q_dup_s16(weights); weights += 1; #else const int16x4_t valphah = vld1_dup_s16(weights); weights += 1; #endif const int32x4_t valphav = vmovl_s16(vld1_dup_s16(weights)); weights += 1; size_t c = channels; for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) { const uint8x8_t vtl01234567 = vld1_u8(i0); i0 += 8; const uint8x8_t vtr01234567 = vld1_u8(i1); i1 += 8; const uint8x8_t vbl01234567 = vld1_u8(i2); i2 += 8; const uint8x8_t vbr01234567 = vld1_u8(i3); i3 += 8; 
const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 const uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); vst1_u8(output, vo01234567); output += 8; } if XNN_UNLIKELY(c != 0) { const uint8x8_t vtl01234567 = vld1_u8(i0); const uint8x8_t vtr01234567 = vld1_u8(i1); const uint8x8_t vbl01234567 = vld1_u8(i2); const uint8x8_t vbr01234567 = vld1_u8(i3); const int16x8_t vtd01234567 = vreinterpretq_s16_u16(vsubl_u8(vtr01234567, vtl01234567)); const int16x8_t vbd01234567 = vreinterpretq_s16_u16(vsubl_u8(vbr01234567, vbl01234567)); const int16x8_t vdl01234567 = vreinterpretq_s16_u16(vsubl_u8(vbl01234567, vtl01234567)); const int16x8_t vxtl01234567 = vreinterpretq_s16_u16(vmovl_u8(vtl01234567)); const int16x8_t vdd01234567 = vsubq_s16(vbd01234567, vtd01234567); #if XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), vget_low_s16(valphah)); const int32x4_t vt4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vtd01234567, valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), vget_low_s16(valphah)); const int32x4_t vd4567 = vmlal_high_s16(vshll_n_s16(vget_high_s16(vdl01234567), 11), vdd01234567, valphah); #else // !XNN_ARCH_ARM64 const int32x4_t vt0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vxtl01234567), 11), vget_low_s16(vtd01234567), valphah); const int32x4_t vt4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vxtl01234567), 11), vget_high_s16(vtd01234567), valphah); const int32x4_t vd0123 = vmlal_s16(vshll_n_s16(vget_low_s16(vdl01234567), 11), vget_low_s16(vdd01234567), valphah); const int32x4_t vd4567 = vmlal_s16(vshll_n_s16(vget_high_s16(vdl01234567), 
11), vget_high_s16(vdd01234567), valphah); #endif // !XNN_ARCH_ARM64 const int32x4_t vacc0123 = vmlaq_s32(vshlq_n_s32(vt0123, 11), vd0123, valphav); const int32x4_t vacc4567 = vmlaq_s32(vshlq_n_s32(vt4567, 11), vd4567, valphav); #if XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vuzp2q_s16(vreinterpretq_s16_s32(vacc0123), vreinterpretq_s16_s32(vacc4567)); #else // !XNN_ARCH_ARM64 const int16x8_t vacc01234567 = vcombine_s16(vshrn_n_s32(vacc0123, 16), vshrn_n_s32(vacc4567, 16)); #endif // !XNN_ARCH_ARM64 uint8x8_t vo01234567 = vrshrn_n_u16(vreinterpretq_u16_s16(vacc01234567), 6); if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vo01234567), 0); output += 4; vo01234567 = vext_u8(vo01234567, vo01234567, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vo01234567), 0); output += 2; vo01234567 = vext_u8(vo01234567, vo01234567, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vo01234567, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const uint8_t** input, size_t input_offset, uint8_t* output, size_t input_increment, size_t output_increment, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.max); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.min); do { uint8_t* o = output; { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; const uint8_t* i8 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16; const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16; const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16; const uint8x16_t vi3 = vld1q_u8(i3); i3 += 16; const uint8x16_t vi4 = vld1q_u8(i4); i4 += 16; const uint8x16_t vi5 = vld1q_u8(i5); i5 += 16; const uint8x16_t vi6 = vld1q_u8(i6); i6 += 16; const uint8x16_t vi7 = vld1q_u8(i7); i7 += 16; const uint8x16_t vi8 = vld1q_u8(i8); i8 += 16; const uint8x16_t vmax018 = vmaxq_u8(vmaxq_u8(vi0, vi1), vi8); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax01678 = vmaxq_u8(vmax018, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax01678); const 
uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); vst1q_u8(o, vout); o += 16; } if (c != 0) { const uint8x16_t vi0 = vld1q_u8(i0); const uint8x16_t vi1 = vld1q_u8(i1); const uint8x16_t vi2 = vld1q_u8(i2); const uint8x16_t vi3 = vld1q_u8(i3); const uint8x16_t vi4 = vld1q_u8(i4); const uint8x16_t vi5 = vld1q_u8(i5); const uint8x16_t vi6 = vld1q_u8(i6); const uint8x16_t vi7 = vld1q_u8(i7); const uint8x16_t vi8 = vld1q_u8(i8); const uint8x16_t vmax018 = vmaxq_u8(vmaxq_u8(vi0, vi1), vi8); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax01678 = vmaxq_u8(vmax018, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax01678); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); uint8x8_t vout_lo = vget_low_u8(vout); if (c & 8) { vst1_u8(o, vout_lo); o += 8; vout_lo = vget_high_u8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_u8(vout_lo), 0); o += 4; vout_lo = vext_u8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_u8(vout_lo), 0); o += 2; vout_lo = vext_u8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_u8(o, vout_lo, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 16; c -= 16) { const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16; const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16; const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16; const uint8x16_t vi3 = vld1q_u8(i3); i3 += 16; const uint8x16_t vi4 = vld1q_u8(i4); i4 += 16; const uint8x16_t vi5 = vld1q_u8(i5); i5 += 16; const uint8x16_t vi6 = vld1q_u8(i6); i6 += 16; const uint8x16_t vi7 = vld1q_u8(i7); i7 += 16; const uint8x16_t vo = vld1q_u8(o); const uint8x16_t vmax01 = vmaxq_u8(vmaxq_u8(vi0, vi1), vo); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax0167 = vmaxq_u8(vmax01, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax0167); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); vst1q_u8(o, vout); o += 16; } if (c != 0) { const uint8x16_t vi0 = vld1q_u8(i0); const uint8x16_t vi1 = vld1q_u8(i1); const uint8x16_t vi2 = vld1q_u8(i2); const uint8x16_t vi3 = vld1q_u8(i3); const uint8x16_t vi4 = vld1q_u8(i4); const uint8x16_t vi5 = vld1q_u8(i5); const uint8x16_t vi6 = vld1q_u8(i6); const uint8x16_t vi7 = vld1q_u8(i7); const uint8x16_t vo = vld1q_u8(o); const uint8x16_t vmax01 = vmaxq_u8(vmaxq_u8(vi0, 
vi1), vo); const uint8x16_t vmax23 = vmaxq_u8(vi2, vi3); const uint8x16_t vmax45 = vmaxq_u8(vi4, vi5); const uint8x16_t vmax67 = vmaxq_u8(vi6, vi7); const uint8x16_t vmax2345 = vmaxq_u8(vmax23, vmax45); const uint8x16_t vmax0167 = vmaxq_u8(vmax01, vmax67); const uint8x16_t vmax = vmaxq_u8(vmax2345, vmax0167); const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min); uint8x8_t vout_lo = vget_low_u8(vout); if (c & 8) { vst1_u8(o, vout_lo); o += 8; vout_lo = vget_high_u8(vout); } if (c & 4) { vst1_lane_u32((void*) o, vreinterpret_u32_u8(vout_lo), 0); o += 4; vout_lo = vext_u8(vout_lo, vout_lo, 4); } if (c & 2) { vst1_lane_u16((void*) o, vreinterpret_u16_u8(vout_lo), 0); o += 2; vout_lo = vext_u8(vout_lo, vout_lo, 2); } if (c & 1) { vst1_lane_u8(o, vout_lo, 0); o += 1; } } } input = (const uint8_t**) ((uintptr_t) input + input_increment); output = (uint8_t*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_u8_rmax_ukernel__neon( size_t batch, const uint8_t* input, uint8_t* output) { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); if XNN_LIKELY(batch >= 16) { uint8x16_t vmax = vmovq_n_u8(0); do { const uint8x16_t vx = vld1q_u8(input); input += 16; vmax = vmaxq_u8(vmax, vx); batch -= 16; } while (batch >= 16); if (batch != 0) { const size_t x_increment = batch - 16; input = (const uint8_t*) ((uintptr_t) input + x_increment); const uint8x16_t vx = vld1q_u8(input); vmax = vmaxq_u8(vmax, vx); } uint8x8_t vmax8 = vmax_u8(vget_low_u8(vmax), vget_high_u8(vmax)); const uint8x8_t vmax4 = vpmax_u8(vmax8, vmax8); const uint8x8_t vmax2 = vpmax_u8(vmax4, vmax4); const uint8x8_t vmax1 = vpmax_u8(vmax2, vmax2); vst1_lane_u8(output, vmax1, 0); } else { uint8x8_t vmax = vmov_n_u8(0); do { const uint8x8_t vx = vld1_dup_u8(input); input += 1; vmax = vmax_u8(vmax, vx); } while (--batch != 0); vst1_lane_u8(output, vmax, 0); } } void xnn_u8_vclamp_ukernel__neon_x64( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->neon.max); const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->neon.min); for (; batch >= 64; batch -= 64) { uint8x16_t vacc0 = vld1q_u8(input); input += 16; uint8x16_t vacc1 = vld1q_u8(input); input += 16; uint8x16_t vacc2 = vld1q_u8(input); input += 16; uint8x16_t vacc3 = vld1q_u8(input); input += 16; vacc0 = vmaxq_u8(vacc0, voutput_min); vacc1 = vmaxq_u8(vacc1, voutput_min); vacc2 = vmaxq_u8(vacc2, voutput_min); vacc3 = vmaxq_u8(vacc3, voutput_min); vacc0 = vminq_u8(vacc0, voutput_max); vacc1 = vminq_u8(vacc1, voutput_max); vacc2 = vminq_u8(vacc2, voutput_max); vacc3 = vminq_u8(vacc3, voutput_max); vst1q_u8(output, vacc0); output += 16; vst1q_u8(output, vacc1); output += 16; vst1q_u8(output, vacc2); output += 16; vst1q_u8(output, vacc3); output += 16; } for (; batch >= 8; batch -= 8) { uint8x8_t vacc = vld1_u8(input); input += 8; vacc = vmin_u8(vacc, vget_low_u8(voutput_max)); vacc = vmax_u8(vacc, vget_low_u8(voutput_min)); vst1_u8(output, vacc); output += 8; } if XNN_UNLIKELY(batch != 0) { uint8x8_t vacc = vld1_u8(input); input += 8; vacc = vmin_u8(vacc, vget_low_u8(voutput_max)); vacc = vmax_u8(vacc, vget_low_u8(voutput_min)); if (batch & 4) { vst1_lane_u32((void*) output, vreinterpret_u32_u8(vacc), 0); output += 4; vacc = vext_u8(vacc, vacc, 4); } if (batch & 
2) { vst1_lane_u16((void*) output, vreinterpret_u16_u8(vacc), 0); output += 2; vacc = vext_u8(vacc, vacc, 2); } if (batch & 1) { vst1_lane_u8(output, vacc, 0); } } } void xnn_x16_packw_gemm_goi_ukernel_x16__neon_ld4lane_x8_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint16_t* weights, const uint16_t* bias, uint16_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 16); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); uint16x8x4_t vtmp0123x01234567; vtmp0123x01234567.val[0] = vdupq_n_u16(0); vtmp0123x01234567.val[1] = vdupq_n_u16(0); vtmp0123x01234567.val[2] = vdupq_n_u16(0); vtmp0123x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x01234567; vtmp4567x01234567.val[0] = vdupq_n_u16(0); vtmp4567x01234567.val[1] = vdupq_n_u16(0); vtmp4567x01234567.val[2] = vdupq_n_u16(0); vtmp4567x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp0123x89ABCDEF; vtmp0123x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp0123x89ABCDEF.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x89ABCDEF; vtmp4567x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp4567x89ABCDEF.val[3] = vdupq_n_u16(0); do { // NC main loop multiple of 16 const uint16_t* w0 = weights; size_t n = nc; for (; n >= 16; n -= 16) { if XNN_LIKELY(bias != NULL) { uint16x8_t vb0 = vld1q_u16(bias); bias += 8; uint16x8_t vb8 = vld1q_u16(bias); bias += 8; vst1q_u16(packed_weights, vb0); packed_weights += 8; vst1q_u16(packed_weights, vb8); packed_weights += 8; } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; vst1q_u16(packed_weights, vzero); packed_weights += 8; } const uint16_t* w1 = w0 + kc; const uint16_t* w2 = w1 + kc; const uint16_t* w3 = w2 + kc; const uint16_t* w4 = w3 + kc; const uint16_t* w5 = w4 + kc; const uint16_t* w6 = w5 + kc; const uint16_t* w7 = w6 + kc; const uint16_t* w8 = w7 + kc; const uint16_t* w9 = w8 + kc; const uint16_t* w10 = w9 + kc; const uint16_t* w11 = w10 + kc; const uint16_t* w12 = w11 + kc; const uint16_t* w13 = w12 + kc; const uint16_t* w14 = w13 + kc; const uint16_t* w15 = w14 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); xnn_prefetch_to_l1((const int8_t*) w8); xnn_prefetch_to_l1((const int8_t*) w8 + 64); xnn_prefetch_to_l1((const int8_t*) w9); xnn_prefetch_to_l1((const int8_t*) w9 + 64); xnn_prefetch_to_l1((const int8_t*) w10); xnn_prefetch_to_l1((const int8_t*) w10 + 64); xnn_prefetch_to_l1((const int8_t*) w11); xnn_prefetch_to_l1((const int8_t*) w11 + 64); xnn_prefetch_to_l1((const int8_t*) w12); xnn_prefetch_to_l1((const int8_t*) w12 + 64); xnn_prefetch_to_l1((const int8_t*) w13); xnn_prefetch_to_l1((const int8_t*) w13 + 64); 
xnn_prefetch_to_l1((const int8_t*) w14); xnn_prefetch_to_l1((const int8_t*) w14 + 64); xnn_prefetch_to_l1((const int8_t*) w15); xnn_prefetch_to_l1((const int8_t*) w15 + 64); // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w15, vtmp0123x89ABCDEF, 7); w15 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w8, vtmp4567x89ABCDEF, 0); w8 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w9, vtmp4567x89ABCDEF, 1); w9 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w10, vtmp4567x89ABCDEF, 2); w10 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w11, vtmp4567x89ABCDEF, 3); w11 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w12, vtmp4567x89ABCDEF, 4); w12 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w13, vtmp4567x89ABCDEF, 5); w13 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w14, vtmp4567x89ABCDEF, 6); w14 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w15, vtmp4567x89ABCDEF, 7); w15 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); xnn_prefetch_to_l1((const int8_t*) w8 + 128); xnn_prefetch_to_l1((const int8_t*) w9 + 128); xnn_prefetch_to_l1((const int8_t*) w10 + 128); xnn_prefetch_to_l1((const int8_t*) w11 + 128); xnn_prefetch_to_l1((const int8_t*) w12 + 128); xnn_prefetch_to_l1((const int8_t*) w13 + 128); xnn_prefetch_to_l1((const int8_t*) w14 + 128); xnn_prefetch_to_l1((const int8_t*) w15 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, 
vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w15, vtmp0123x89ABCDEF, 7); w15 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 16x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); uint16x8_t vtmp0x89ABCDEF = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w8, 
vtmp0x89ABCDEF, 0); w8 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w9, vtmp0x89ABCDEF, 1); w9 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w10, vtmp0x89ABCDEF, 2); w10 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w11, vtmp0x89ABCDEF, 3); w11 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w12, vtmp0x89ABCDEF, 4); w12 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w13, vtmp0x89ABCDEF, 5); w13 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w14, vtmp0x89ABCDEF, 6); w14 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w15, vtmp0x89ABCDEF, 7); w15 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; vst1q_u16(packed_weights, vtmp0x89ABCDEF); packed_weights += 8; break; } // KC remainder of 16x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); uint16x8x2_t vtmp01x89ABCDEF; vtmp01x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp01x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w8, vtmp01x89ABCDEF, 0); w8 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w9, vtmp01x89ABCDEF, 1); w9 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w10, vtmp01x89ABCDEF, 2); w10 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w11, vtmp01x89ABCDEF, 3); w11 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w12, vtmp01x89ABCDEF, 4); w12 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w13, vtmp01x89ABCDEF, 5); w13 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w14, vtmp01x89ABCDEF, 6); w14 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w15, vtmp01x89ABCDEF, 7); w15 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[1]); packed_weights += 8; break; } // KC remainder of 16x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); uint16x8x3_t vtmp012x89ABCDEF; vtmp012x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w8, vtmp012x89ABCDEF, 0); w8 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w9, vtmp012x89ABCDEF, 1); w9 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w10, vtmp012x89ABCDEF, 2); w10 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w11, vtmp012x89ABCDEF, 3); w11 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w12, vtmp012x89ABCDEF, 4); w12 += 3; vtmp012x89ABCDEF = 
vld3q_lane_u16(w13, vtmp012x89ABCDEF, 5); w13 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w14, vtmp012x89ABCDEF, 6); w14 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w15, vtmp012x89ABCDEF, 7); w15 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w15; } // NC remainder (1..15) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 15); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (16 - n); } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; vst1q_u16(packed_weights, vzero); packed_weights += 8; } // NR remainder has less than 16 rows so last row is not loaded const uint16_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint16_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint16_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint16_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint16_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint16_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } const uint16_t* w7 = w6 + kc; if XNN_UNPREDICTABLE(n < 8) { w7 = w6; } const uint16_t* w8 = w7 + kc; if XNN_UNPREDICTABLE(n <= 8) { w8 = w7; } const uint16_t* w9 = w8 + kc; if XNN_UNPREDICTABLE(n < 10) { w9 = w8; } const uint16_t* w10 = w9 + kc; if XNN_UNPREDICTABLE(n <= 10) { w10 = w9; } const uint16_t* w11 = w10 + kc; if XNN_UNPREDICTABLE(n < 12) { w11 = w10; } const uint16_t* w12 = w11 + kc; if XNN_UNPREDICTABLE(n <= 12) { w12 = w11; } const uint16_t* w13 = w12 + kc; if XNN_UNPREDICTABLE(n < 14) { w13 = w12; } const uint16_t* w14 = w13 + kc; if XNN_UNPREDICTABLE(n <= 14) { w14 = w13; } // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, 
vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w8, vtmp4567x89ABCDEF, 0); w8 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w9, vtmp4567x89ABCDEF, 1); w9 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w10, vtmp4567x89ABCDEF, 2); w10 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w11, vtmp4567x89ABCDEF, 3); w11 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w12, vtmp4567x89ABCDEF, 4); w12 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w13, vtmp4567x89ABCDEF, 5); w13 += 4; vtmp4567x89ABCDEF = vld4q_lane_u16(w14, vtmp4567x89ABCDEF, 6); w14 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); xnn_prefetch_to_l1((const int8_t*) w8 + 128); xnn_prefetch_to_l1((const int8_t*) w9 + 128); xnn_prefetch_to_l1((const int8_t*) w10 + 128); xnn_prefetch_to_l1((const int8_t*) w11 + 128); xnn_prefetch_to_l1((const int8_t*) w12 + 128); xnn_prefetch_to_l1((const int8_t*) w13 + 128); xnn_prefetch_to_l1((const int8_t*) w14 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x89ABCDEF.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w8, vtmp0123x89ABCDEF, 0); w8 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w9, vtmp0123x89ABCDEF, 1); w9 += 4; vtmp0123x89ABCDEF = 
vld4q_lane_u16(w10, vtmp0123x89ABCDEF, 2); w10 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w11, vtmp0123x89ABCDEF, 3); w11 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w12, vtmp0123x89ABCDEF, 4); w12 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w13, vtmp0123x89ABCDEF, 5); w13 += 4; vtmp0123x89ABCDEF = vld4q_lane_u16(w14, vtmp0123x89ABCDEF, 6); w14 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x89ABCDEF.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 16x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); uint16x8_t vtmp0x89ABCDEF = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w8, vtmp0x89ABCDEF, 0); w8 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w9, vtmp0x89ABCDEF, 1); w9 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w10, vtmp0x89ABCDEF, 2); w10 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w11, vtmp0x89ABCDEF, 3); w11 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w12, vtmp0x89ABCDEF, 4); w12 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w13, vtmp0x89ABCDEF, 5); w13 += 1; vtmp0x89ABCDEF = vld1q_lane_u16(w14, vtmp0x89ABCDEF, 6); w14 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; vst1q_u16(packed_weights, vtmp0x89ABCDEF); packed_weights += 8; break; } // KC remainder of 16x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); uint16x8x2_t vtmp01x89ABCDEF; vtmp01x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp01x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w8, vtmp01x89ABCDEF, 0); w8 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w9, vtmp01x89ABCDEF, 1); w9 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w10, vtmp01x89ABCDEF, 2); w10 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w11, vtmp01x89ABCDEF, 3); w11 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w12, vtmp01x89ABCDEF, 4); w12 += 2; vtmp01x89ABCDEF = vld2q_lane_u16(w13, vtmp01x89ABCDEF, 5); w13 += 2; vtmp01x89ABCDEF = 
vld2q_lane_u16(w14, vtmp01x89ABCDEF, 6); w14 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x89ABCDEF.val[1]); packed_weights += 8; break; } // KC remainder of 16x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); uint16x8x3_t vtmp012x89ABCDEF; vtmp012x89ABCDEF.val[0] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[1] = vdupq_n_u16(0); vtmp012x89ABCDEF.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w8, vtmp012x89ABCDEF, 0); w8 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w9, vtmp012x89ABCDEF, 1); w9 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w10, vtmp012x89ABCDEF, 2); w10 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w11, vtmp012x89ABCDEF, 3); w11 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w12, vtmp012x89ABCDEF, 4); w12 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w13, vtmp012x89ABCDEF, 5); w13 += 3; vtmp012x89ABCDEF = vld3q_lane_u16(w14, vtmp012x89ABCDEF, 6); w14 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x89ABCDEF.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x16_packw_gemm_goi_ukernel_x8__neon_ld4lane_x8_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint16_t* weights, const uint16_t* bias, uint16_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); uint16x8x4_t vtmp0123x01234567; vtmp0123x01234567.val[0] = vdupq_n_u16(0); vtmp0123x01234567.val[1] = vdupq_n_u16(0); vtmp0123x01234567.val[2] = vdupq_n_u16(0); vtmp0123x01234567.val[3] = vdupq_n_u16(0); uint16x8x4_t vtmp4567x01234567; vtmp4567x01234567.val[0] = vdupq_n_u16(0); vtmp4567x01234567.val[1] = vdupq_n_u16(0); vtmp4567x01234567.val[2] = vdupq_n_u16(0); vtmp4567x01234567.val[3] = vdupq_n_u16(0); do { // NC main loop multiple of 8 const uint16_t* w0 = weights; size_t n = nc; for (; n >= 8; n -= 8) { if XNN_LIKELY(bias != NULL) { uint16x8_t vb0 = vld1q_u16(bias); bias += 8; vst1q_u16(packed_weights, vb0); packed_weights += 8; } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; } const uint16_t* w1 = w0 + kc; const uint16_t* w2 = w1 + kc; const 
uint16_t* w3 = w2 + kc; const uint16_t* w4 = w3 + kc; const uint16_t* w5 = w4 + kc; const uint16_t* w6 = w5 + kc; const uint16_t* w7 = w6 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w7, vtmp4567x01234567, 7); w7 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp0123x01234567 = 
vld4q_lane_u16(w7, vtmp0123x01234567, 7); w7 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 8x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vtmp0x01234567 = vld1q_lane_u16(w7, vtmp0x01234567, 7); w7 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; break; } // KC remainder of 8x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vtmp01x01234567 = vld2q_lane_u16(w7, vtmp01x01234567, 7); w7 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; break; } // KC remainder of 8x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vtmp012x01234567 = vld3q_lane_u16(w7, vtmp012x01234567, 7); w7 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint16x8_t vzero = vmovq_n_u16(0); vst1q_u16(packed_weights, vzero); packed_weights += 8; } // NR remainder has less than 8 rows so last row is not loaded const uint16_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint16_t* w2 = w1 
+ kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint16_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint16_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint16_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint16_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 8 size_t k = kc; for (; k >= 8; k -= 8) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vtmp4567x01234567 = vld4q_lane_u16(w0, vtmp4567x01234567, 0); w0 += 4; vtmp4567x01234567 = vld4q_lane_u16(w1, vtmp4567x01234567, 1); w1 += 4; vtmp4567x01234567 = vld4q_lane_u16(w2, vtmp4567x01234567, 2); w2 += 4; vtmp4567x01234567 = vld4q_lane_u16(w3, vtmp4567x01234567, 3); w3 += 4; vtmp4567x01234567 = vld4q_lane_u16(w4, vtmp4567x01234567, 4); w4 += 4; vtmp4567x01234567 = vld4q_lane_u16(w5, vtmp4567x01234567, 5); w5 += 4; vtmp4567x01234567 = vld4q_lane_u16(w6, vtmp4567x01234567, 6); w6 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp4567x01234567.val[3]); packed_weights += 8; } // KC remainder multiple of 4 if (k >= 4) { vtmp0123x01234567 = vld4q_lane_u16(w0, vtmp0123x01234567, 0); w0 += 4; vtmp0123x01234567 = vld4q_lane_u16(w1, vtmp0123x01234567, 1); w1 += 4; vtmp0123x01234567 = vld4q_lane_u16(w2, vtmp0123x01234567, 2); w2 += 4; vtmp0123x01234567 = vld4q_lane_u16(w3, vtmp0123x01234567, 3); w3 += 4; vtmp0123x01234567 = vld4q_lane_u16(w4, vtmp0123x01234567, 4); w4 += 4; vtmp0123x01234567 = vld4q_lane_u16(w5, vtmp0123x01234567, 5); w5 += 4; vtmp0123x01234567 = vld4q_lane_u16(w6, vtmp0123x01234567, 6); w6 += 4; vst1q_u16(packed_weights, vtmp0123x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[2]); packed_weights += 8; vst1q_u16(packed_weights, vtmp0123x01234567.val[3]); packed_weights += 8; k -= 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 8x1 case 1: { uint16x8_t vtmp0x01234567 = vdupq_n_u16(0); vtmp0x01234567 = vld1q_lane_u16(w0, vtmp0x01234567, 0); w0 += 1; vtmp0x01234567 = vld1q_lane_u16(w1, vtmp0x01234567, 1); w1 += 1; vtmp0x01234567 = vld1q_lane_u16(w2, 
vtmp0x01234567, 2); w2 += 1; vtmp0x01234567 = vld1q_lane_u16(w3, vtmp0x01234567, 3); w3 += 1; vtmp0x01234567 = vld1q_lane_u16(w4, vtmp0x01234567, 4); w4 += 1; vtmp0x01234567 = vld1q_lane_u16(w5, vtmp0x01234567, 5); w5 += 1; vtmp0x01234567 = vld1q_lane_u16(w6, vtmp0x01234567, 6); w6 += 1; vst1q_u16(packed_weights, vtmp0x01234567); packed_weights += 8; break; } // KC remainder of 8x2 case 2: { uint16x8x2_t vtmp01x01234567; vtmp01x01234567.val[0] = vdupq_n_u16(0); vtmp01x01234567.val[1] = vdupq_n_u16(0); vtmp01x01234567 = vld2q_lane_u16(w0, vtmp01x01234567, 0); w0 += 2; vtmp01x01234567 = vld2q_lane_u16(w1, vtmp01x01234567, 1); w1 += 2; vtmp01x01234567 = vld2q_lane_u16(w2, vtmp01x01234567, 2); w2 += 2; vtmp01x01234567 = vld2q_lane_u16(w3, vtmp01x01234567, 3); w3 += 2; vtmp01x01234567 = vld2q_lane_u16(w4, vtmp01x01234567, 4); w4 += 2; vtmp01x01234567 = vld2q_lane_u16(w5, vtmp01x01234567, 5); w5 += 2; vtmp01x01234567 = vld2q_lane_u16(w6, vtmp01x01234567, 6); w6 += 2; vst1q_u16(packed_weights, vtmp01x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp01x01234567.val[1]); packed_weights += 8; break; } // KC remainder of 8x3 case 3: { uint16x8x3_t vtmp012x01234567; vtmp012x01234567.val[0] = vdupq_n_u16(0); vtmp012x01234567.val[1] = vdupq_n_u16(0); vtmp012x01234567.val[2] = vdupq_n_u16(0); vtmp012x01234567 = vld3q_lane_u16(w0, vtmp012x01234567, 0); w0 += 3; vtmp012x01234567 = vld3q_lane_u16(w1, vtmp012x01234567, 1); w1 += 3; vtmp012x01234567 = vld3q_lane_u16(w2, vtmp012x01234567, 2); w2 += 3; vtmp012x01234567 = vld3q_lane_u16(w3, vtmp012x01234567, 3); w3 += 3; vtmp012x01234567 = vld3q_lane_u16(w4, vtmp012x01234567, 4); w4 += 3; vtmp012x01234567 = vld3q_lane_u16(w5, vtmp012x01234567, 5); w5 += 3; vtmp012x01234567 = vld3q_lane_u16(w6, vtmp012x01234567, 6); w6 += 3; vst1q_u16(packed_weights, vtmp012x01234567.val[0]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[1]); packed_weights += 8; vst1q_u16(packed_weights, vtmp012x01234567.val[2]); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon( const uint16_t* input, uint16_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x16_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint16_t)); assert(input_stride >= block_width * sizeof(uint16_t)); const size_t tile_height = 8; const size_t tile_width = 8; const size_t tile_hbytes = tile_height * sizeof(uint16_t); const size_t tile_wbytes = tile_width * sizeof(uint16_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes; const uint16_t* i0 = input; uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 7); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 8; bh -= 8) { const uint16x8_t v3_0 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_1 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_2 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + 
input_stride); const uint16x8_t v3_3 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_4 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_5 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_6 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8_t v3_7 = vld1q_u16(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const uint16x8x2_t v2_0 = vzipq_u16(v3_0, v3_4); const uint16x8x2_t v2_1 = vzipq_u16(v3_1, v3_5); const uint16x8x2_t v2_2 = vzipq_u16(v3_2, v3_6); const uint16x8x2_t v2_3 = vzipq_u16(v3_3, v3_7); const uint16x8x2_t v1_0 = vzipq_u16(v2_0.val[0], v2_2.val[0]); const uint16x8x2_t v1_1 = vzipq_u16(v2_0.val[1], v2_2.val[1]); const uint16x8x2_t v1_2 = vzipq_u16(v2_1.val[0], v2_3.val[0]); const uint16x8x2_t v1_3 = vzipq_u16(v2_1.val[1], v2_3.val[1]); const uint16x8x2_t v0_0 = vzipq_u16(v1_0.val[0], v1_2.val[0]); const uint16x8x2_t v0_1 = vzipq_u16(v1_0.val[1], v1_2.val[1]); const uint16x8x2_t v0_2 = vzipq_u16(v1_1.val[0], v1_3.val[0]); const uint16x8x2_t v0_3 = vzipq_u16(v1_1.val[1], v1_3.val[1]); o = (uint16_t*) ((uintptr_t) o + oN_offset); vst1q_u16(o, v0_3.val[1]); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_3.val[0]); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_2.val[1]); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_2.val[0]); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_0.val[1]); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u16(o, v0_0.val[0]); } o = (uint16_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint16x8_t v3_0 = vld1q_u16(i0); const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint16x8_t v3_1 = vld1q_u16(i1); const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint16x8_t v3_2 = vld1q_u16(i2); const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const uint16x8_t v3_3 = vld1q_u16(i3); const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const uint16x8_t v3_4 = vld1q_u16(i4); const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const uint16x8_t v3_5 = vld1q_u16(i5); const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const uint16x8_t v3_6 = vld1q_u16(i6); const uint16x8_t v3_7 = vmovq_n_u16(0); const uint16x8x2_t v2_0 = vzipq_u16(v3_0, v3_4); const uint16x8x2_t v2_1 = vzipq_u16(v3_1, v3_5); const uint16x8x2_t v2_2 = vzipq_u16(v3_2, v3_6); const uint16x8x2_t v2_3 = vzipq_u16(v3_3, v3_7); const uint16x8x2_t v1_0 = vzipq_u16(v2_0.val[0], v2_2.val[0]); const uint16x8x2_t v1_1 = vzipq_u16(v2_0.val[1], v2_2.val[1]); const uint16x8x2_t v1_2 = vzipq_u16(v2_1.val[0], v2_3.val[0]); 
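// The transpose itself is a three-stage zip (butterfly) network: stage 1 interleaves source rows 4 apart (v3_* -> v2_*), stage 2 rows 2 apart (v2_* -> v1_*), and stage 3 rows 1 apart (v1_* -> v0_*); after the final stage the val[0]/val[1] halves of v0_0..v0_3 hold the eight columns of the source tile, i.e. the transposed rows.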
const uint16x8x2_t v1_3 = vzipq_u16(v2_1.val[1], v2_3.val[1]); const uint16x8x2_t v0_0 = vzipq_u16(v1_0.val[0], v1_2.val[0]); const uint16x8x2_t v0_1 = vzipq_u16(v1_0.val[1], v1_2.val[1]); const uint16x8x2_t v0_2 = vzipq_u16(v1_1.val[0], v1_3.val[0]); const uint16x8x2_t v0_3 = vzipq_u16(v1_1.val[1], v1_3.val[1]); uint16x4_t v0_low = vget_low_u16(v0_0.val[0]); uint16x4_t v1_low = vget_low_u16(v0_0.val[1]); uint16x4_t v2_low = vget_low_u16(v0_1.val[0]); uint16x4_t v3_low = vget_low_u16(v0_1.val[1]); uint16x4_t v4_low = vget_low_u16(v0_2.val[0]); uint16x4_t v5_low = vget_low_u16(v0_2.val[1]); uint16x4_t v6_low = vget_low_u16(v0_3.val[0]); uint16x4_t v7_low = vget_low_u16(v0_3.val[1]); if (bh & 4) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_u16(o, v7_low); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v6_low); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v5_low); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v4_low); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_u16(o, v0_low); o += 4; v0_low = vget_high_u16(v0_0.val[0]); v1_low = vget_high_u16(v0_0.val[1]); v2_low = vget_high_u16(v0_1.val[0]); v3_low = vget_high_u16(v0_1.val[1]); v4_low = vget_high_u16(v0_2.val[0]); v5_low = vget_high_u16(v0_2.val[1]); v6_low = vget_high_u16(v0_3.val[0]); v7_low = vget_high_u16(v0_3.val[1]); } if (bh & 2) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32((void*) o, vreinterpret_u32_u16(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u16(v0_low), 0); o += 2; v0_low = vext_u16(v0_low, v0_low, 2); v1_low = vext_u16(v1_low, v1_low, 2); v2_low = vext_u16(v2_low, v2_low, 2); v3_low = vext_u16(v3_low, v3_low, 2); v4_low = vext_u16(v4_low, v4_low, 2); v5_low = vext_u16(v5_low, v5_low, 2); v6_low = vext_u16(v6_low, v6_low, 2); v7_low = vext_u16(v7_low, v7_low, 2); } if (bh & 1) { o = (uint16_t*) ((uintptr_t) o + oN_stride); vst1_lane_u16(o, v7_low, 0); if XNN_UNPREDICTABLE(block_width 
> 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v6_low, 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v5_low, 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v4_low, 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint16_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16(o, v0_low, 0); } } i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset); o = (uint16_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void xnn_x24_transposec_ukernel__2x2_neon_tbl64( const void* input, void* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x24_transpose_params* params) XNN_OOB_READS { assert(output_stride >= block_height * 3); assert(input_stride >= block_width * 3); const size_t tile_height = 2; const size_t tile_width = 2; const size_t tile_wbytes = tile_width * 3; const size_t tile_wbytes_minus_4 = tile_wbytes - 4; const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_height * output_stride - block_height * 3; const size_t tile_stride = tile_height * input_stride; const uint8_t* i0 = (const uint8_t*) input; const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); uint8_t* o0 = (uint8_t*) output; uint8_t* o1 = (uint8_t*) ((uintptr_t) o0 + output_stride); const uint8x8_t vperm0 = vld1_u8(params->neon_tbl64.pos0); const uint8x8_t vperm1 = vld1_u8(params->neon_tbl64.pos1); do { if XNN_UNPREDICTABLE(block_width < 2) { o1 = o0; } size_t bh = block_height; for (; bh >= 2; bh -= 2) { uint8x8x2_t v; v.val[0] = vld1_u8(i0); i0 = (const uint8_t*) ((uintptr_t) i0 + tile_stride); v.val[1] = vld1_u8(i1); i1 = (const uint8_t*) ((uintptr_t) i1 + tile_stride); const uint8x8_t vres0 = vtbl2_u8(v, vperm0); const uint8x8_t vres1 = vtbl2_u8(v, vperm1); vst1_lane_u32((void*) o1, vreinterpret_u32_u8(vres1), 0); o1 = (uint8_t*) ((uintptr_t) o1 + 4); vst1_lane_u32((void*) o0, vreinterpret_u32_u8(vres0), 0); o0 = (uint8_t*) ((uintptr_t) o0 + 4); vst1_lane_u16((void*) o1, vreinterpret_u16_u8(vres1), 2); o1 = (uint8_t*) ((uintptr_t) o1 + tile_wbytes_minus_4); vst1_lane_u16((void*) o0, vreinterpret_u16_u8(vres0), 2); o0 = (uint8_t*) ((uintptr_t) o0 + tile_wbytes_minus_4); } if (bh != 0) { if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } uint8x8_t v = vld1_u8(i0); const uint8x8_t vres0 = vtbl1_u8(v, vperm0); const uint8x8_t vres1 = vtbl1_u8(v, vperm1); if (bh & 1) { vst1_lane_u16((void*) o1, vreinterpret_u16_u8(vres1), 0); o1 += 2; vst1_lane_u16((void*) o0, vreinterpret_u16_u8(vres0), 0); o0 += 2; vst1_lane_u8(o1, vres1, 2); o1 += 1; vst1_lane_u8(o0, vres0, 2); o0 += 1; } } i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset); i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); o0 = (uint8_t*) ((uintptr_t) o0 + output_reset); o1 = (uint8_t*) ((uintptr_t) o1 + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void 
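// The two x32 packing kernels below produce the GOI ("groups, output
// channels, input channels") layout consumed by the f32/x32 GEMM
// microkernels: for every tile of nr output channels, nr bias slots are
// written first, then the weights follow in k-major order so that each k
// step contributes nr contiguous values, and each tile is followed by
// extra_bytes of padding. A minimal scalar sketch of the nr == 2 tile
// layout (illustrative only -- `pw`, `w`, and `b` are hypothetical names,
// and the extra_bytes padding is elided):
//
//   uint32_t* pw = packed;                // packed output cursor
//   for (size_t i = 0; i < 2; i++) {
//     *pw++ = (b != NULL) ? b[i] : 0;     // 2 bias slots per tile
//   }
//   for (size_t k = 0; k < kc; k++) {
//     for (size_t i = 0; i < 2; i++) {
//       *pw++ = w[i * kc + k];            // channel-minor, k-major
//     }
//   }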
xnn_x32_packw_gemm_goi_ukernel_x2__neon_ld2lane_x2_prfm(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint32_t* weights,
  const uint32_t* bias,
  uint32_t* packed_weights,
  size_t extra_bytes,
  const void* params)
{
  assert(g != 0);
  assert(nc != 0);
  assert(kc != 0);
  assert(nr == 2);
  assert(kr == 1);
  assert(sr == 1);
  assert(weights != NULL);
  assert(packed_weights != NULL);

  uint32x2x2_t v00;
  v00.val[0] = vdup_n_u32(0);
  v00.val[1] = vdup_n_u32(0);
  do {
    // NC main loop multiple of 2
    const uint32_t* w0 = weights;
    size_t n = nc;
    for (; n >= 2; n -= 2) {
      if XNN_LIKELY(bias != NULL) {
        uint32x2_t vb0 = vld1_u32(bias); bias += 2;
        vst1_u32(packed_weights, vb0); packed_weights += 2;
      } else {
        const uint32x2_t vzero = vmov_n_u32(0);
        vst1_u32(packed_weights, vzero); packed_weights += 2;
      }
      const uint32_t* w1 = w0 + kc;
      xnn_prefetch_to_l1((const int8_t*) w0);
      xnn_prefetch_to_l1((const int8_t*) w0 + 64);
      xnn_prefetch_to_l1((const int8_t*) w1);
      xnn_prefetch_to_l1((const int8_t*) w1 + 64);

      // KC main loop multiple of 2
      size_t k = kc;
      for (; k >= 2; k -= 2) {
        v00 = vld2_lane_u32(w0, v00, 0); w0 += 2;
        v00 = vld2_lane_u32(w1, v00, 1); w1 += 2;
        xnn_prefetch_to_l1((const int8_t*) w0 + 128);
        xnn_prefetch_to_l1((const int8_t*) w1 + 128);
        vst1_u32(packed_weights + 0, v00.val[0]);
        vst1_u32(packed_weights + 2, v00.val[1]);
        packed_weights += 4;
      }

      // KC remainder
      for (; k != 0; --k) {
        v00.val[0] = vld1_lane_u32(w0, v00.val[0], 0); w0 += 1;
        v00.val[0] = vld1_lane_u32(w1, v00.val[0], 1); w1 += 1;
        vst1_u32(packed_weights + 0, v00.val[0]);
        packed_weights += 2;
      }
      packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes);
      w0 = w1;
    }

    if XNN_UNLIKELY(n != 0) {
      // NC remainder of 1
      if XNN_LIKELY(bias != NULL) {
        *packed_weights = *bias++;
      } else {
        const uint32x2_t vzero = vmov_n_u32(0);
        vst1_u32(packed_weights + 0, vzero);
      }
      packed_weights += 2;
      size_t k = kc;
      do {
        *packed_weights = *w0++; packed_weights += 2;
      } while (--k);
      packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes);
    }
    weights += nc * kc;
  } while (--g != 0);
}

void xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_x4_prfm(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint32_t* weights,
  const uint32_t* bias,
  uint32_t* packed_weights,
  size_t extra_bytes,
  const void* params)
{
  assert(g != 0);
  assert(nc != 0);
  assert(kc != 0);
  assert(nr == 8);
  assert(kr == 1);
  assert(sr == 1);
  assert(weights != NULL);
  assert(packed_weights != NULL);

  uint32x4x4_t vtmp0123x0123;
  vtmp0123x0123.val[0] = vdupq_n_u32(0);
  vtmp0123x0123.val[1] = vdupq_n_u32(0);
  vtmp0123x0123.val[2] = vdupq_n_u32(0);
  vtmp0123x0123.val[3] = vdupq_n_u32(0);
  uint32x4x4_t vtmp0123x4567;
  vtmp0123x4567.val[0] = vdupq_n_u32(0);
  vtmp0123x4567.val[1] = vdupq_n_u32(0);
  vtmp0123x4567.val[2] = vdupq_n_u32(0);
  vtmp0123x4567.val[3] = vdupq_n_u32(0);
  do {
    // NC main loop multiple of 8
    const uint32_t* w0 = weights;
    size_t n = nc;
    for (; n >= 8; n -= 8) {
      if XNN_LIKELY(bias != NULL) {
        uint32x4_t vb0 = vld1q_u32(bias); bias += 4;
        uint32x4_t vb4 = vld1q_u32(bias); bias += 4;
        vst1q_u32(packed_weights, vb0); packed_weights += 4;
        vst1q_u32(packed_weights, vb4); packed_weights += 4;
      } else {
        const uint32x4_t vzero = vmovq_n_u32(0);
        vst1q_u32(packed_weights, vzero); packed_weights += 4;
        vst1q_u32(packed_weights, vzero); packed_weights += 4;
      }
      const uint32_t* w1 = w0 + kc;
      const uint32_t* w2 = w1 + kc;
      const uint32_t* w3 = w2 + kc;
      const uint32_t* w4 = w3 + kc;
      const uint32_t* w5 = w4 + kc;
      const uint32_t* w6 = w5 + kc;
      const uint32_t* w7 = w6 + kc;
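      // w0..w7 now point at the 8 rows (output channels) of this tile, each
      // kc elements long; the prefetches that follow warm the first 128
      // bytes of every row before the strided lane loads begin.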
xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vtmp0123x4567 = vld4q_lane_u32(w7, vtmp0123x4567, 3); w7 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); w0 += 1; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); w1 += 1; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); w2 += 1; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); w3 += 1; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); w4 += 1; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); w5 += 1; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); w6 += 1; vtmp0x4567 = vld1q_lane_u32(w7, vtmp0x4567, 3); w7 += 1; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); w0 += 2; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); w1 += 2; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); w2 += 2; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); w3 += 2; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); w4 += 2; vtmp01x4567 
= vld2q_lane_u32(w5, vtmp01x4567, 1); w5 += 2; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); w6 += 2; vtmp01x4567 = vld2q_lane_u32(w7, vtmp01x4567, 3); w7 += 2; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); w0 += 3; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); w1 += 3; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); w2 += 3; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); w3 += 3; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); w4 += 3; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); w5 += 3; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); w6 += 3; vtmp012x4567 = vld3q_lane_u32(w7, vtmp012x4567, 3); w7 += 3; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } // NR remainder has less than 8 rows so last row is not loaded const uint32_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint32_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint32_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint32_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint32_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint32_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; 
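// Note on store order: for each of the four k indices, channels 0-3 (vtmp0123x0123.val[k]) are emitted immediately before channels 4-7 (vtmp0123x4567.val[k]), so every k step's 8 weights stay contiguous in the packed buffer.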
vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x32_packw_gemm_goi_ukernel_x8s4__neon_ld4lane_x4_prfm( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint32_t* weights, const uint32_t* bias, uint32_t* packed_weights, size_t extra_bytes, const void* params) 
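// This s4 ("shuffle by 4") variant packs with sr == 4: after every lane load
// the four accumulator registers are rotated (val[3] -> val[0] -> val[1] ->
// val[2] -> val[3]), so the weights of output channel i land pre-rotated by
// i positions along k. The matching s4 GEMM microkernels can then advance
// through k by rotating the input vector (e.g. with vextq_f32) instead of
// broadcasting individual elements.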
{ assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); assert(kr == 1); assert(sr == 4); assert(weights != NULL); assert(packed_weights != NULL); uint32x4x4_t vtmp0123x0123; vtmp0123x0123.val[0] = vdupq_n_u32(0); vtmp0123x0123.val[1] = vdupq_n_u32(0); vtmp0123x0123.val[2] = vdupq_n_u32(0); vtmp0123x0123.val[3] = vdupq_n_u32(0); uint32x4x4_t vtmp0123x4567; vtmp0123x4567.val[0] = vdupq_n_u32(0); vtmp0123x4567.val[1] = vdupq_n_u32(0); vtmp0123x4567.val[2] = vdupq_n_u32(0); vtmp0123x4567.val[3] = vdupq_n_u32(0); do { // NC main loop multiple of 8 const uint32_t* w0 = weights; size_t n = nc; for (; n >= 8; n -= 8) { if XNN_LIKELY(bias != NULL) { uint32x4_t vb0 = vld1q_u32(bias); bias += 4; uint32x4_t vb4 = vld1q_u32(bias); bias += 4; vst1q_u32(packed_weights, vb0); packed_weights += 4; vst1q_u32(packed_weights, vb4); packed_weights += 4; } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } const uint32_t* w1 = w0 + kc; const uint32_t* w2 = w1 + kc; const uint32_t* w3 = w2 + kc; const uint32_t* w4 = w3 + kc; const uint32_t* w5 = w4 + kc; const uint32_t* w6 = w5 + kc; const uint32_t* w7 = w6 + kc; xnn_prefetch_to_l1((const int8_t*) w0); xnn_prefetch_to_l1((const int8_t*) w0 + 64); xnn_prefetch_to_l1((const int8_t*) w1); xnn_prefetch_to_l1((const int8_t*) w1 + 64); xnn_prefetch_to_l1((const int8_t*) w2); xnn_prefetch_to_l1((const int8_t*) w2 + 64); xnn_prefetch_to_l1((const int8_t*) w3); xnn_prefetch_to_l1((const int8_t*) w3 + 64); xnn_prefetch_to_l1((const int8_t*) w4); xnn_prefetch_to_l1((const int8_t*) w4 + 64); xnn_prefetch_to_l1((const int8_t*) w5); xnn_prefetch_to_l1((const int8_t*) w5 + 64); xnn_prefetch_to_l1((const int8_t*) w6); xnn_prefetch_to_l1((const int8_t*) w6 + 64); xnn_prefetch_to_l1((const int8_t*) w7); xnn_prefetch_to_l1((const int8_t*) w7 + 64); // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { uint32x4_t vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = 
vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w7, vtmp0123x4567, 3); w7 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); xnn_prefetch_to_l1((const int8_t*) w7 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp1x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp1x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); w0 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); w1 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); w2 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); w3 += 1; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); w4 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); w5 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); w6 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w7, vtmp0x4567, 3); w7 += 1; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, 
vtmp0x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); w0 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); w1 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); w2 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); w3 += 2; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); w4 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); w5 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); w6 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w7, vtmp01x4567, 3); w7 += 2; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 
0); w0 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); w1 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); w2 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); w3 += 3; vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); w4 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); w5 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); w6 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w7, vtmp012x4567, 3); w7 += 3; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const uint32x4_t vzero = vmovq_n_u32(0); vst1q_u32(packed_weights, vzero); packed_weights += 4; vst1q_u32(packed_weights, vzero); packed_weights += 4; } // NR remainder has less than 8 rows so last row is not loaded const uint32_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint32_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint32_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint32_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint32_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint32_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 4 size_t k = kc; for (; k >= 4; k -= 4) { uint32x4_t vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w0, vtmp0123x0123, 0); w0 += 4; vsrtmp = vtmp0123x0123.val[3]; 
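// The assignments that follow rotate val[3] -> val[2] -> val[1] -> val[0] -> val[3] after every lane load: this shift-register rotation is what implements the sr == 4 shuffle of the packed layout, offsetting each row's data by (row & 3) slots.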
vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w1, vtmp0123x0123, 1); w1 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w2, vtmp0123x0123, 2); w2 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x0123 = vld4q_lane_u32(w3, vtmp0123x0123, 3); w3 += 4; vsrtmp = vtmp0123x0123.val[3]; vtmp0123x0123.val[3] = vtmp0123x0123.val[2]; vtmp0123x0123.val[2] = vtmp0123x0123.val[1]; vtmp0123x0123.val[1] = vtmp0123x0123.val[0]; vtmp0123x0123.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w4, vtmp0123x4567, 0); w4 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w5, vtmp0123x4567, 1); w5 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vtmp0123x4567 = vld4q_lane_u32(w6, vtmp0123x4567, 2); w6 += 4; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; vsrtmp = vtmp0123x4567.val[3]; vtmp0123x4567.val[3] = vtmp0123x4567.val[2]; vtmp0123x4567.val[2] = vtmp0123x4567.val[1]; vtmp0123x4567.val[1] = vtmp0123x4567.val[0]; vtmp0123x4567.val[0] = vsrtmp; xnn_prefetch_to_l1((const int8_t*) w0 + 128); xnn_prefetch_to_l1((const int8_t*) w1 + 128); xnn_prefetch_to_l1((const int8_t*) w2 + 128); xnn_prefetch_to_l1((const int8_t*) w3 + 128); xnn_prefetch_to_l1((const int8_t*) w4 + 128); xnn_prefetch_to_l1((const int8_t*) w5 + 128); xnn_prefetch_to_l1((const int8_t*) w6 + 128); vst1q_u32(packed_weights, vtmp0123x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x0123.val[3]); packed_weights += 4; vst1q_u32(packed_weights, vtmp0123x4567.val[3]); packed_weights += 4; } // KC remainder of 1..3 // Same as main loop but ld1, ld2 or ld3 if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { // KC remainder of 1 case 1: { uint32x4_t vtmp0x0123 = vdupq_n_u32(0); uint32x4_t vtmp0x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp1x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp1x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp0x0123 = vld1q_lane_u32(w0, vtmp0x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w1, vtmp0x0123, 1); 
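// Unlike the NC main loop, these lane loads do not advance w0..w6: this KC remainder is the final use of the row pointers in the last column group, so the increments are omitted.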
vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w2, vtmp0x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x0123 = vld1q_lane_u32(w3, vtmp0x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vsrtmp1x0123; vsrtmp1x0123 = vtmp0x0123; vtmp0x0123 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w4, vtmp0x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w5, vtmp0x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vtmp0x4567 = vld1q_lane_u32(w6, vtmp0x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vsrtmp1x4567; vsrtmp1x4567 = vtmp0x4567; vtmp0x4567 = vsrtmp; vst1q_u32(packed_weights, vtmp0x0123); packed_weights += 4; vst1q_u32(packed_weights, vtmp0x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp1x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 2 case 2: { uint32x4x2_t vtmp01x0123; vtmp01x0123.val[0] = vdupq_n_u32(0); vtmp01x0123.val[1] = vdupq_n_u32(0); uint32x4x2_t vtmp01x4567; vtmp01x4567.val[0] = vdupq_n_u32(0); vtmp01x4567.val[1] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp2x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp2x4567 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp01x0123 = vld2q_lane_u32(w0, vtmp01x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w1, vtmp01x0123, 1); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w2, vtmp01x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x0123 = vld2q_lane_u32(w3, vtmp01x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vsrtmp2x0123; vsrtmp2x0123 = vtmp01x0123.val[1]; vtmp01x0123.val[1] = vtmp01x0123.val[0]; vtmp01x0123.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w4, vtmp01x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w5, vtmp01x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vtmp01x4567 = vld2q_lane_u32(w6, vtmp01x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = 
vsrtmp2x4567; vsrtmp2x4567 = vtmp01x4567.val[1]; vtmp01x4567.val[1] = vtmp01x4567.val[0]; vtmp01x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp01x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp01x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp2x4567); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } // KC remainder of 3 case 3: { uint32x4x3_t vtmp012x0123; vtmp012x0123.val[0] = vdupq_n_u32(0); vtmp012x0123.val[1] = vdupq_n_u32(0); vtmp012x0123.val[2] = vdupq_n_u32(0); uint32x4x3_t vtmp012x4567; vtmp012x4567.val[0] = vdupq_n_u32(0); vtmp012x4567.val[1] = vdupq_n_u32(0); vtmp012x4567.val[2] = vdupq_n_u32(0); uint32x4_t vsrtmp; uint32x4_t vsrtmp3x0123 = vdupq_n_u32(0); uint32x4_t vsrtmp3x4567 = vdupq_n_u32(0); vtmp012x0123 = vld3q_lane_u32(w0, vtmp012x0123, 0); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w1, vtmp012x0123, 1); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w2, vtmp012x0123, 2); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x0123 = vld3q_lane_u32(w3, vtmp012x0123, 3); vsrtmp = vsrtmp3x0123; vsrtmp3x0123 = vtmp012x0123.val[2]; vtmp012x0123.val[2] = vtmp012x0123.val[1]; vtmp012x0123.val[1] = vtmp012x0123.val[0]; vtmp012x0123.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w4, vtmp012x4567, 0); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w5, vtmp012x4567, 1); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vtmp012x4567 = vld3q_lane_u32(w6, vtmp012x4567, 2); vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vsrtmp = vsrtmp3x4567; vsrtmp3x4567 = vtmp012x4567.val[2]; vtmp012x4567.val[2] = vtmp012x4567.val[1]; vtmp012x4567.val[1] = vtmp012x4567.val[0]; vtmp012x4567.val[0] = vsrtmp; vst1q_u32(packed_weights, vtmp012x0123.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[0]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[1]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x0123.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vtmp012x4567.val[2]); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x0123); packed_weights += 4; vst1q_u32(packed_weights, vsrtmp3x4567); packed_weights += 4; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void 
xnn_x32_packx_ukernel_4x__neon_st4_x4_prfm( size_t m, size_t k, const uint32_t* x, size_t x_stride, uint32_t* restrict y) { assert(m != 0); assert(m <= 4); assert(k != 0); assert(x != NULL); assert(y != NULL); const uint32_t* x0 = x; const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride); if XNN_UNPREDICTABLE(m < 2) { x1 = x0; } const uint32_t* x2 = (const uint32_t*) ((uintptr_t) x1 + x_stride); if XNN_UNPREDICTABLE(m <= 2) { x2 = x1; } const uint32_t* x3 = (const uint32_t*) ((uintptr_t) x2 + x_stride); if XNN_UNPREDICTABLE(m != 4) { x3 = x2; } for (; k >= 4; k -= 4) { uint32x4x4_t vx0123x0123; vx0123x0123.val[0] = vld1q_u32(x0); x0 += 4; vx0123x0123.val[1] = vld1q_u32(x1); x1 += 4; vx0123x0123.val[2] = vld1q_u32(x2); x2 += 4; vx0123x0123.val[3] = vld1q_u32(x3); x3 += 4; xnn_prefetch_to_l1((const int8_t*) x0 + 128); xnn_prefetch_to_l1((const int8_t*) x1 + 128); xnn_prefetch_to_l1((const int8_t*) x2 + 128); xnn_prefetch_to_l1((const int8_t*) x3 + 128); vst4q_u32(y, vx0123x0123); y += 16; } if XNN_UNLIKELY(k != 0) { uint32x4_t vt0123 = vdupq_n_u32(0); do { vt0123 = vld1q_lane_u32(x0, vt0123, 0); x0 += 1; vt0123 = vld1q_lane_u32(x1, vt0123, 1); x1 += 1; vt0123 = vld1q_lane_u32(x2, vt0123, 2); x2 += 1; vt0123 = vld1q_lane_u32(x3, vt0123, 3); x3 += 1; xnn_prefetch_to_l1((const int8_t*) x0 + 128); xnn_prefetch_to_l1((const int8_t*) x1 + 128); xnn_prefetch_to_l1((const int8_t*) x2 + 128); xnn_prefetch_to_l1((const int8_t*) x3 + 128); vst1q_u32(y, vt0123); y += 4; } while (--k != 0); } } void xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon( const uint32_t* input, uint32_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); const size_t tile_height = 4; const size_t tile_width = 4; const size_t tile_hbytes = tile_height * sizeof(uint32_t); const size_t tile_wbytes = tile_width * sizeof(uint32_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t) - tile_hbytes; const uint32_t* i0 = input; uint32_t* o = (uint32_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 3); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 4; bh -= 4) { const uint32x4_t v2_0 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_1 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_2 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4_t v2_3 = vld1q_u32(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const uint32x4x2_t v1_0 = vzipq_u32(v2_0, v2_2); const uint32x4x2_t v1_1 = vzipq_u32(v2_1, v2_3); const uint32x4x2_t v0_0 = vzipq_u32(v1_0.val[0], v1_1.val[0]); const uint32x4x2_t v0_1 = vzipq_u32(v1_0.val[1], v1_1.val[1]); o = (uint32_t*) ((uintptr_t) o + oN_offset); vst1q_u32(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_0.val[1]); if 
XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u32(o, v0_0.val[0]); } o = (uint32_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint32x4_t v2_0 = vld1q_u32(i0); const uint32_t *i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint32x4_t v2_1 = vld1q_u32(i1); const uint32_t *i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint32x4_t v2_2 = vld1q_u32(i2); const uint32x4_t v2_3 = vmovq_n_u32(0); const uint32x4x2_t v1_0 = vzipq_u32(v2_0, v2_2); const uint32x4x2_t v1_1 = vzipq_u32(v2_1, v2_3); const uint32x4x2_t v0_0 = vzipq_u32(v1_0.val[0], v1_1.val[0]); const uint32x4x2_t v0_1 = vzipq_u32(v1_0.val[1], v1_1.val[1]); uint32x2_t v0_low = vget_low_u32(v0_0.val[0]); uint32x2_t v1_low = vget_low_u32(v0_0.val[1]); uint32x2_t v2_low = vget_low_u32(v0_1.val[0]); uint32x2_t v3_low = vget_low_u32(v0_1.val[1]); if (bh & 2) { o = (uint32_t*) ((uintptr_t) o + oN_stride); vst1_u32(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_u32(o, v0_low); o += 2; v0_low = vget_high_u32(v0_0.val[0]); v1_low = vget_high_u32(v0_0.val[1]); v2_low = vget_high_u32(v0_1.val[0]); v3_low = vget_high_u32(v0_1.val[1]); } if (bh & 1) { o = (uint32_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint32_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32(o, v0_low, 0); } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); o = (uint32_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void xnn_x32_unpool_ukernel__neon( size_t kernel_elements, size_t channels, uint32_t fill, const uint32_t* input, const uint32_t* index, uint32_t** output) { // Pre-initialize outputs with constant. const uint32x4_t vfill = vdupq_n_u32(fill); uint32_t** os = output; do { uint32_t* o = *os++; size_t c = channels; for (; c >= 4; c -= 4) { vst1q_u32(o, vfill); o += 4; } if (c != 0) { if (c & 2) { vst1_u32(o, vget_low_u32(vfill)); o += 2; } if (c & 1) { vst1q_lane_u32(o, vfill, 0); } } } while (--kernel_elements != 0); // Copy indexed elements to output. 
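// Each of the kernel_elements output pages was pre-filled with `fill` above; the scatter loop below then writes the `channels` input values, with index[j] selecting the page that receives input value j at channel byte offset j * sizeof(uint32_t).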
size_t offset = 0; do { const uint32_t i = *index++; *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++; offset += sizeof(uint32_t); } while (--channels != 0); } void xnn_x32_zip_x2_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); uint32_t* o = output; while (n >= 16) { uint32x4x2_t vxy; vxy.val[0] = vld1q_u32(x); x += 4; vxy.val[1] = vld1q_u32(y); y += 4; vst2q_u32(o, vxy); o += 8; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x2_t vxy; vxy.val[0] = vld1_u32(x); x += 2; vxy.val[1] = vld1_u32(y); y += 2; vst2_u32(o, vxy); o += 4; } if (n & 4) { uint32x2_t vxy = vld1_dup_u32(x); vxy = vld1_lane_u32(y, vxy, 1); vst1_u32(o, vxy); } } } void xnn_x32_zip_x3_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); uint32_t* o = output; while (n >= 16) { uint32x4x3_t vxyz; vxyz.val[0] = vld1q_u32(x); x += 4; vxyz.val[1] = vld1q_u32(y); y += 4; vxyz.val[2] = vld1q_u32(z); z += 4; vst3q_u32(o, vxyz); o += 12; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x3_t vxyz; vxyz.val[0] = vld1_u32(x); x += 2; vxyz.val[1] = vld1_u32(y); y += 2; vxyz.val[2] = vld1_u32(z); z += 2; vst3_u32(o, vxyz); o += 6; } if (n & 4) { uint32x2_t vxy = vld1_dup_u32(x); const uint32x2_t vz = vld1_dup_u32(z); vxy = vld1_lane_u32(y, vxy, 1); vst1_u32(o, vxy); o += 2; vst1_lane_u32(o, vz, 0); } } } void xnn_x32_zip_x4_ukernel__neon( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); const uint32_t* x = input; const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n); uint32_t* o = output; while (n >= 16) { uint32x4x4_t vxyzw; vxyzw.val[0] = vld1q_u32(x); x += 4; vxyzw.val[1] = vld1q_u32(y); y += 4; vxyzw.val[2] = vld1q_u32(z); z += 4; vxyzw.val[3] = vld1q_u32(w); w += 4; vst4q_u32(o, vxyzw); o += 16; n -= 16; } if XNN_UNLIKELY(n != 0) { if (n & 8) { uint32x2x4_t vxyzw; vxyzw.val[0] = vld1_u32(x); x += 2; vxyzw.val[1] = vld1_u32(y); y += 2; vxyzw.val[2] = vld1_u32(z); z += 2; vxyzw.val[3] = vld1_u32(w); w += 2; vst4_u32(o, vxyzw); o += 8; } if (n & 4) { uint32x4_t vxyzw = vld1q_dup_u32(x); vxyzw = vld1q_lane_u32(y, vxyzw, 1); vxyzw = vld1q_lane_u32(z, vxyzw, 2); vxyzw = vld1q_lane_u32(w, vxyzw, 3); vst1q_u32(o, vxyzw); } } } void xnn_x32_zip_xm_ukernel__neon( size_t n, size_t m, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % 4 == 0); assert(m >= 4); const uint32_t* w = input; const size_t group_increment = m * 4; const size_t input_increment = n * 3; const size_t output_increment = 16 - m * n; const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1)); uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16)); for (size_t i = 0; i < m; i += 4) { w = (const uint32_t*) ((uintptr_t) w + input_increment); if (w >= last_input) { w = last_input; } const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n); const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n); const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n); size_t k = n; while (k >= 16) { const uint32x4_t vx = vld1q_u32(x); x += 4; const uint32x4_t vy = vld1q_u32(y); y += 4; const uint32x4_t vz = 
vld1q_u32(z); z += 4; const uint32x4_t vw = vld1q_u32(w); w += 4; const uint32x4x2_t vxy = vzipq_u32(vx, vy); const uint32x4x2_t vzw = vzipq_u32(vz, vw); vst1_u32(output, vget_low_u32(vxy.val[0])); vst1_u32(output + 2, vget_low_u32(vzw.val[0])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_high_u32(vxy.val[0])); vst1_u32(output + 2, vget_high_u32(vzw.val[0])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_low_u32(vxy.val[1])); vst1_u32(output + 2, vget_low_u32(vzw.val[1])); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vget_high_u32(vxy.val[1])); vst1_u32(output + 2, vget_high_u32(vzw.val[1])); output = (uint32_t*) ((uintptr_t) output + group_increment); k -= 16; } if XNN_UNLIKELY(k != 0) { if (k & 8) { const uint32x2_t vx = vld1_u32(x); x += 2; const uint32x2_t vy = vld1_u32(y); y += 2; const uint32x2_t vz = vld1_u32(z); z += 2; const uint32x2_t vw = vld1_u32(w); w += 2; const uint32x2x2_t vxy = vzip_u32(vx, vy); const uint32x2x2_t vzw = vzip_u32(vz, vw); vst1_u32(output, vxy.val[0]); vst1_u32(output + 2, vzw.val[0]); output = (uint32_t*) ((uintptr_t) output + group_increment); vst1_u32(output, vxy.val[1]); vst1_u32(output + 2, vzw.val[1]); output = (uint32_t*) ((uintptr_t) output + group_increment); } if (k & 4) { const uint32x2_t vx = vld1_dup_u32(x); const uint32x2_t vz = vld1_dup_u32(z); const uint32x2_t vxy = vld1_lane_u32(y, vx, 1); const uint32x2_t vzw = vld1_lane_u32(w, vz, 1); w += 1; vst1_u32(output, vxy); vst1_u32(output + 2, vzw); output = (uint32_t*) ((uintptr_t) output + group_increment); } } output = (uint32_t*) ((uintptr_t) output + output_increment); if (output > last_output) { output = last_output; } } } void xnn_x8_packw_gemm_goi_ukernel_x8__scalar_int_x4( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const int8_t* weights, const uint32_t* bias, int8_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); // This kernel is for NR=8 assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); int8_t* out = (int8_t*) packed_weights; const uint32_t* b = (const uint32_t*) bias; do { // NC main loop multiple of 8 const int8_t* w0 = (const int8_t*) weights; size_t n = nc; for (;n >= 8; n -= 8) { if XNN_LIKELY(b != NULL) { ((uint32_t*) out)[0] = b[0]; ((uint32_t*) out)[1] = b[1]; ((uint32_t*) out)[2] = b[2]; ((uint32_t*) out)[3] = b[3]; ((uint32_t*) out)[4] = b[4]; ((uint32_t*) out)[5] = b[5]; ((uint32_t*) out)[6] = b[6]; ((uint32_t*) out)[7] = b[7]; b += 8; } else { ((uint32_t*) out)[0] = 0; ((uint32_t*) out)[1] = 0; ((uint32_t*) out)[2] = 0; ((uint32_t*) out)[3] = 0; ((uint32_t*) out)[4] = 0; ((uint32_t*) out)[5] = 0; ((uint32_t*) out)[6] = 0; ((uint32_t*) out)[7] = 0; } out += 8 * sizeof(uint32_t); const int8_t* w1 = w0 + kc; const int8_t* w2 = w1 + kc; const int8_t* w3 = w2 + kc; const int8_t* w4 = w3 + kc; const int8_t* w5 = w4 + kc; const int8_t* w6 = w5 + kc; const int8_t* w7 = w6 + kc; // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const 
int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; const int8_t v70 = w7[0]; const int8_t v71 = w7[1]; const int8_t v72 = w7[2]; const int8_t v73 = w7[3]; w7 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[7] = v70; out[8] = v01; out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[15] = v71; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[23] = v72; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out[31] = v73; out += 32; } // KC remainder for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] = v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; const int8_t v7 = *w7++; out[7] = v7; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { if XNN_LIKELY(b != NULL) { size_t nb = n; do { *((uint32_t*) out) = *b++; out += sizeof(uint32_t); } while (--nb != 0); } else { size_t nb = n; do { *((uint32_t*) out) = 0; out += sizeof(uint32_t); } while (--nb != 0); } out += (8 - n) * sizeof(uint32_t); // NR remainder has less than 8 rows so last row is not loaded const int8_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const int8_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const int8_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const int8_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const int8_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const int8_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[8] = v01; out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out += 32; } // KC remainder of 1..3 for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] 
= v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon( const uint8_t* input, uint8_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x8_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint8_t)); assert(input_stride >= block_width * sizeof(uint8_t)); const size_t tile_height = 16; const size_t tile_width = 16; const size_t tile_hbytes = tile_height * sizeof(uint8_t); const size_t tile_wbytes = tile_width * sizeof(uint8_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes; const uint8_t* i0 = input; uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 15); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 16; bh -= 16) { const uint8x16_t v4_0 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_1 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_2 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_3 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_4 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_5 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_6 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_7 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_8 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_9 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_10 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_11 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_12 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_13 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_14 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16_t v4_15 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8); const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9); const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10); const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11); const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12); const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13); const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14); const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15); const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]); const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]); const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]); const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]); const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], 
v3_6.val[0]); const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]); const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]); const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]); const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]); const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]); const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]); const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]); const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]); const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]); const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]); const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]); const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]); const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]); const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]); const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]); const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]); const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]); const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]); const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]); o = (uint8_t*) ((uintptr_t) o + oN_offset); vst1q_u8(o, v0_7.val[1]); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_7.val[0]); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_6.val[1]); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_6.val[0]); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_5.val[1]); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_5.val[0]); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_4.val[1]); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_4.val[0]); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_3.val[1]); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_3.val[0]); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_2.val[1]); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_2.val[0]); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_1.val[1]); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_1.val[0]); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_0.val[1]); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1q_u8(o, v0_0.val[0]); } o = (uint8_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const uint8x16_t v4_0 = vld1q_u8(i0); const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const uint8x16_t v4_1 = vld1q_u8(i1); const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const uint8x16_t v4_2 = vld1q_u8(i2); const uint8_t *i3 = (const uint8_t*) 
((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const uint8x16_t v4_3 = vld1q_u8(i3); const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const uint8x16_t v4_4 = vld1q_u8(i4); const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const uint8x16_t v4_5 = vld1q_u8(i5); const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const uint8x16_t v4_6 = vld1q_u8(i6); const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride); if XNN_UNPREDICTABLE(bh < 8) { i7 = i6; } const uint8x16_t v4_7 = vld1q_u8(i7); const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride); if XNN_UNPREDICTABLE(bh <= 8) { i8 = i7; } const uint8x16_t v4_8 = vld1q_u8(i8); const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride); if XNN_UNPREDICTABLE(bh < 10) { i9 = i8; } const uint8x16_t v4_9 = vld1q_u8(i9); const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride); if XNN_UNPREDICTABLE(bh <= 10) { i10 = i9; } const uint8x16_t v4_10 = vld1q_u8(i10); const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride); if XNN_UNPREDICTABLE(bh < 12) { i11 = i10; } const uint8x16_t v4_11 = vld1q_u8(i11); const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride); if XNN_UNPREDICTABLE(bh <= 12) { i12 = i11; } const uint8x16_t v4_12 = vld1q_u8(i12); const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride); if XNN_UNPREDICTABLE(bh < 14) { i13 = i12; } const uint8x16_t v4_13 = vld1q_u8(i13); const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride); if XNN_UNPREDICTABLE(bh <= 14) { i14 = i13; } const uint8x16_t v4_14 = vld1q_u8(i14); const uint8x16_t v4_15 = vmovq_n_u8(0); const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8); const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9); const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10); const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11); const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12); const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13); const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14); const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15); const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]); const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]); const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]); const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]); const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]); const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]); const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]); const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]); const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]); const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]); const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]); const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]); const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]); const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]); const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]); const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]); const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]); const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]); const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]); const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]); const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]); const uint8x16x2_t 
v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]); const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]); const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]); uint8x8_t v0_low = vget_low_u8(v0_0.val[0]); uint8x8_t v1_low = vget_low_u8(v0_0.val[1]); uint8x8_t v2_low = vget_low_u8(v0_1.val[0]); uint8x8_t v3_low = vget_low_u8(v0_1.val[1]); uint8x8_t v4_low = vget_low_u8(v0_2.val[0]); uint8x8_t v5_low = vget_low_u8(v0_2.val[1]); uint8x8_t v6_low = vget_low_u8(v0_3.val[0]); uint8x8_t v7_low = vget_low_u8(v0_3.val[1]); uint8x8_t v8_low = vget_low_u8(v0_4.val[0]); uint8x8_t v9_low = vget_low_u8(v0_4.val[1]); uint8x8_t v10_low = vget_low_u8(v0_5.val[0]); uint8x8_t v11_low = vget_low_u8(v0_5.val[1]); uint8x8_t v12_low = vget_low_u8(v0_6.val[0]); uint8x8_t v13_low = vget_low_u8(v0_6.val[1]); uint8x8_t v14_low = vget_low_u8(v0_7.val[0]); uint8x8_t v15_low = vget_low_u8(v0_7.val[1]); if (bh & 8) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_u8(o, v15_low); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v14_low); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v13_low); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v12_low); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v11_low); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v10_low); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v9_low); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v8_low); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v7_low); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v6_low); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v5_low); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v4_low); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v3_low); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v2_low); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v1_low); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_u8(o, v0_low); o += 8; v0_low = vget_high_u8(v0_0.val[0]); v1_low = vget_high_u8(v0_0.val[1]); v2_low = vget_high_u8(v0_1.val[0]); v3_low = vget_high_u8(v0_1.val[1]); v4_low = vget_high_u8(v0_2.val[0]); v5_low = vget_high_u8(v0_2.val[1]); v6_low = vget_high_u8(v0_3.val[0]); v7_low = vget_high_u8(v0_3.val[1]); v8_low = vget_high_u8(v0_4.val[0]); v9_low = vget_high_u8(v0_4.val[1]); v10_low = vget_high_u8(v0_5.val[0]); v11_low = vget_high_u8(v0_5.val[1]); v12_low = vget_high_u8(v0_6.val[0]); v13_low = vget_high_u8(v0_6.val[1]); v14_low = vget_high_u8(v0_7.val[0]); v15_low = vget_high_u8(v0_7.val[1]); } if (bh & 4) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u32((void*) o, vreinterpret_u32_u8(v15_low), 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } 
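// These tail stores walk the output columns from 15 down to 0; the alternating `block_width > c` / `block_width >= c` guards only step o back while that column exists, so narrower blocks keep overwriting the clamped column and the final store always lands on column 0.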
vst1_lane_u32((void*) o, vreinterpret_u32_u8(v14_low), 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v13_low), 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v12_low), 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v11_low), 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v10_low), 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v9_low), 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v8_low), 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u32((void*) o, vreinterpret_u32_u8(v0_low), 0); o += 4; v0_low = vext_u8(v0_low, v0_low, 4); v1_low = vext_u8(v1_low, v1_low, 4); v2_low = vext_u8(v2_low, v2_low, 4); v3_low = vext_u8(v3_low, v3_low, 4); v4_low = vext_u8(v4_low, v4_low, 4); v5_low = vext_u8(v5_low, v5_low, 4); v6_low = vext_u8(v6_low, v6_low, 4); v7_low = vext_u8(v7_low, v7_low, 4); v8_low = vext_u8(v8_low, v8_low, 4); v9_low = vext_u8(v9_low, v9_low, 4); v10_low = vext_u8(v10_low, v10_low, 4); v11_low = vext_u8(v11_low, v11_low, 4); v12_low = vext_u8(v12_low, v12_low, 4); v13_low = vext_u8(v13_low, v13_low, 4); v14_low = vext_u8(v14_low, v14_low, 4); v15_low = vext_u8(v15_low, v15_low, 4); } if (bh & 2) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u16((void*) o, vreinterpret_u16_u8(v15_low), 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v14_low), 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v13_low), 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v12_low), 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } 
vst1_lane_u16((void*) o, vreinterpret_u16_u8(v11_low), 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v10_low), 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v9_low), 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v8_low), 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v7_low), 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v6_low), 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v5_low), 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v4_low), 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v3_low), 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v2_low), 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v1_low), 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u16((void*) o, vreinterpret_u16_u8(v0_low), 0); o += 2; v0_low = vext_u8(v0_low, v0_low, 2); v1_low = vext_u8(v1_low, v1_low, 2); v2_low = vext_u8(v2_low, v2_low, 2); v3_low = vext_u8(v3_low, v3_low, 2); v4_low = vext_u8(v4_low, v4_low, 2); v5_low = vext_u8(v5_low, v5_low, 2); v6_low = vext_u8(v6_low, v6_low, 2); v7_low = vext_u8(v7_low, v7_low, 2); v8_low = vext_u8(v8_low, v8_low, 2); v9_low = vext_u8(v9_low, v9_low, 2); v10_low = vext_u8(v10_low, v10_low, 2); v11_low = vext_u8(v11_low, v11_low, 2); v12_low = vext_u8(v12_low, v12_low, 2); v13_low = vext_u8(v13_low, v13_low, 2); v14_low = vext_u8(v14_low, v14_low, 2); v15_low = vext_u8(v15_low, v15_low, 2); } if (bh & 1) { o = (uint8_t*) ((uintptr_t) o + oN_stride); vst1_lane_u8(o, v15_low, 0); if XNN_UNPREDICTABLE(block_width > 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v14_low, 0); if XNN_UNPREDICTABLE(block_width >= 15) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v13_low, 0); if XNN_UNPREDICTABLE(block_width > 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v12_low, 0); if XNN_UNPREDICTABLE(block_width >= 13) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v11_low, 0); if XNN_UNPREDICTABLE(block_width > 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v10_low, 0); if XNN_UNPREDICTABLE(block_width >= 11) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v9_low, 0); if XNN_UNPREDICTABLE(block_width > 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v8_low, 0); if XNN_UNPREDICTABLE(block_width >= 9) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v7_low, 0); if XNN_UNPREDICTABLE(block_width > 7) { o = (uint8_t*) ((uintptr_t) o + 
minus_output_stride); } vst1_lane_u8(o, v6_low, 0); if XNN_UNPREDICTABLE(block_width >= 7) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v5_low, 0); if XNN_UNPREDICTABLE(block_width > 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v4_low, 0); if XNN_UNPREDICTABLE(block_width >= 5) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v3_low, 0); if XNN_UNPREDICTABLE(block_width > 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v2_low, 0); if XNN_UNPREDICTABLE(block_width >= 3) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v1_low, 0); if XNN_UNPREDICTABLE(block_width > 1) { o = (uint8_t*) ((uintptr_t) o + minus_output_stride); } vst1_lane_u8(o, v0_low, 0); } }
// Advance the input and output pointers to the next block of columns.
i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset); o = (uint8_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); }

// Interleave two byte streams (x at input, y at input + n) into xy pairs.
void xnn_x8_zip_x2_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x2_t vxy; vxy.val[0] = vld1_u8(x); x += 8; vxy.val[1] = vld1_u8(y); y += 8; vst2_u8(o, vxy); o += 16; n -= 8; } while (n >= 8); if (n != 0) { const size_t address_increment = n - 8; uint8x8x2_t vxy; vxy.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxy.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vst2_u8((uint8_t*) ((uintptr_t) o + address_increment * 2), vxy); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; o[0] = vx; o[1] = vy; o += 2; } while (--n != 0); } }

// Interleave three byte streams into xyz triples.
void xnn_x8_zip_x3_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x3_t vxyz; vxyz.val[0] = vld1_u8(x); x += 8; vxyz.val[1] = vld1_u8(y); y += 8; vxyz.val[2] = vld1_u8(z); z += 8; vst3_u8(o, vxyz); o += 24; n -= 8; } while (n >= 8); if (n != 0) { const size_t address_increment = n - 8; uint8x8x3_t vxyz; vxyz.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxyz.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vxyz.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); vst3_u8((uint8_t*) ((uintptr_t) o + address_increment * 3), vxyz); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; const uint8_t vz = *z++; o[0] = vx; o[1] = vy; o[2] = vz; o += 3; } while (--n != 0); } }

// Interleave four byte streams into xyzw quads.
void xnn_x8_zip_x4_ukernel__neon( size_t n, const uint8_t* input, uint8_t* output) { const uint8_t* x = input; const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); uint8_t* o = output; if (n >= 8) { do { uint8x8x4_t vxyzw; vxyzw.val[0] = vld1_u8(x); x += 8; vxyzw.val[1] = vld1_u8(y); y += 8; vxyzw.val[2] = vld1_u8(z); z += 8; vxyzw.val[3] = vld1_u8(w); w += 8; vst4_u8(o, vxyzw); o += 32; n -= 8; } while (n >= 8); if (n != 0) {
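/* Tail of 1-7 groups: step the pointers back by (8 - n) bytes and redo the
 * last full 8-byte chunk; the re-written span of the output was already
 * produced with identical values by the previous iteration. */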
const size_t address_increment = n - 8; uint8x8x4_t vxyzw; vxyzw.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); vxyzw.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); vxyzw.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); vxyzw.val[3] = vld1_u8((const uint8_t*) ((uintptr_t) w + address_increment)); vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw); } } else { do { const uint8_t vx = *x++; const uint8_t vy = *y++; const uint8_t vz = *z++; const uint8_t vw = *w++; o[0] = vx; o[1] = vy; o[2] = vz; o[3] = vw; o += 4; } while (--n != 0); } }

// Interleave m byte streams of n bytes each. Streams are processed four at a
// time; when m is not a multiple of 4, the last group re-reads the final
// streams and clamps the output pointer so all writes stay in bounds.
void xnn_x8_zip_xm_ukernel__neon( size_t n, size_t m, const uint8_t* input, uint8_t* output) { const uint8_t* w = input; const size_t input_increment = n * 3; const size_t output_increment = 4 - m * n; const uint8_t* last_input = w + n * (m - 1); uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4)); if (n >= 8) { for (size_t i = 0; i < m; i += 4) { size_t k = n; w = (const uint8_t*) ((uintptr_t) w + input_increment); if (w >= last_input) { w = last_input; } const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n); const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n); const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n); while (k >= 8) { const uint8x8_t vx = vld1_u8(x); x += 8; const uint8x8_t vy = vld1_u8(y); y += 8; const uint8x8_t vz = vld1_u8(z); z += 8; const uint8x8_t vw = vld1_u8(w); w += 8; const uint8x8x2_t vxy = vzip_u8(vx, vy); const uint8x8x2_t vzw = vzip_u8(vz, vw); const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 1); output = (uint8_t*) ((uintptr_t) output + m); k -= 8; }
// The tail reload steps each pointer back by (8 - k) bytes; the negative
// shift count makes vshl_u64 shift right, discarding the bytes that were
// already interleaved and leaving the k fresh bytes in the low lanes.
if (k != 0) { const size_t address_increment = k - 8; x = (const uint8_t*) ((uintptr_t) x + address_increment); y = (const uint8_t*) ((uintptr_t) y + address_increment); z = (const uint8_t*) ((uintptr_t) z + address_increment); w = (const uint8_t*) ((uintptr_t) w + address_increment); const int64x1_t vshift = vmov_n_s64(8 * address_increment); const uint64x1_t vx = vshl_u64(vreinterpret_u64_u8(vld1_u8(x)), vshift); const uint64x1_t vy = vshl_u64(vreinterpret_u64_u8(vld1_u8(y)), vshift); const uint64x1_t vz = vshl_u64(vreinterpret_u64_u8(vld1_u8(z)), vshift); const uint64x1_t vw = vshl_u64(vreinterpret_u64_u8(vld1_u8(w)), vshift); w += 8; const uint8x8x2_t vxy = vzip_u8(vreinterpret_u8_u64(vx), vreinterpret_u8_u64(vy)); const uint8x8x2_t vzw = vzip_u8(vreinterpret_u8_u64(vz), vreinterpret_u8_u64(vw)); const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]); uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]); uint32x2_t vxyzw2 = vreinterpret_u32_u16(vxyzw_hi.val[0]); uint32x2_t vxyzw3 = vreinterpret_u32_u16(vxyzw_hi.val[1]);
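/* The k remaining interleaved groups are stored one 32-bit lane at a time;
 * after each k&4 / k&2 branch the unwritten halves are shifted down
 * (vxyzw0 = vxyzw2, etc.) so the next branch always starts at vxyzw0. */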
if (k & 4) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw0, 1); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw1, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw1, 1); output = (uint8_t*) ((uintptr_t) output + m); vxyzw0 = vxyzw2; vxyzw1 = vxyzw3; } if (k & 2) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); vst1_lane_u32((void*) output, vxyzw0, 1); output = (uint8_t*) ((uintptr_t) output + m); vxyzw0 = vxyzw1; } if (k & 1) { vst1_lane_u32((void*) output, vxyzw0, 0); output = (uint8_t*) ((uintptr_t) output + m); } } output = (uint8_t*) ((uintptr_t) output + output_increment); if (output > last_output) { output = last_output; } } } else {
// Scalar fallback for short streams (n < 8): transpose byte by byte.
const uint8_t* i = input; uint8_t* o = output; size_t k = n; do { size_t l = m; const uint8_t* ii = i++; do { *o++ = *ii; ii += n; } while (--l != 0); } while (--k != 0); } }

// Fill `rows` rows of `channels` bytes with a 4-byte pattern replicated
// across a vector register; consecutive rows are `output_stride` bytes apart.
void xnn_xx_fill_ukernel__neon_x64( size_t rows, size_t channels, void* output, size_t output_stride, const uint32_t fill_pattern) { assert(rows != 0); assert(channels != 0); const size_t output_increment = output_stride - channels; const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern)); do { size_t c = channels; for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); } for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = ((uint8_t*) output + 16); } if XNN_UNLIKELY(c != 0) { if XNN_LIKELY(c & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = ((uint8_t*) output + 8); } if XNN_LIKELY(c & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = ((uint8_t*) output + 4); } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if XNN_LIKELY(c & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = ((uint8_t*) output + 2); vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if XNN_LIKELY(c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = ((uint8_t*) output + 1); } } output = (void*) ((uintptr_t) output + output_increment); } while (--rows != 0); }

// Pad each row on both sides: `pre_padding` bytes of the fill pattern,
// `channels` bytes copied from the input row, then `post_padding` bytes of
// the fill pattern. Tagged XNN_OOB_READS because the copy tail loads a full
// 16-byte vector and stores only the valid part.
void xnn_xx_pad_ukernel__neon( size_t rows, size_t channels, size_t pre_padding, size_t post_padding, const void* input, size_t input_stride, void* output, size_t output_stride, uint32_t fill_pattern) XNN_OOB_READS { const size_t input_increment = input_stride - channels; const size_t output_increment = output_stride - (pre_padding + channels + post_padding); const uint8x16_t vfill_pattern = vreinterpretq_u8_u32(vdupq_n_u32(fill_pattern)); do {
// Pre-pad input channels.
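/* The fill pattern sits in all four 32-bit lanes of vfill_pattern, so 16-,
 * 8- and 4-byte chunks can be stored from it directly; after a 2-byte store
 * the subpattern is rotated with vext_u8 so a following 1-byte store keeps
 * the 4-byte period. */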
size_t l = pre_padding; if XNN_LIKELY(l != 0) { for (; l >= 16 * sizeof(uint8_t); l -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = (uint8_t*) output + 16; } if (l & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = (uint8_t*) output + 8; } if (l & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = (uint8_t*) output + 4; } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if (l & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = (uint8_t*) output + 2; vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if (l & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = (uint8_t*) output + 1; } }
// Copy input channels.
size_t c = channels; for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) { const uint8x16_t vdata = vld1q_u8(input); input = (const uint8_t*) input + 16; vst1q_u8(output, vdata); output = (uint8_t*) output + 16; } if XNN_UNLIKELY(c != 0) { uint8x16_t vdata = vld1q_u8(input); input = (const void*) ((uintptr_t) input + c); uint8x8_t vsubdata = vget_low_u8(vdata); if (c & (8 * sizeof(uint8_t))) { vst1_u8(output, vsubdata); output = (uint8_t*) output + 8; vsubdata = vget_high_u8(vdata); } if (c & (4 * sizeof(uint8_t))) { vst1_lane_u32(output, vreinterpret_u32_u8(vsubdata), 0); output = (uint8_t*) output + 4; vsubdata = vext_u8(vsubdata, vsubdata, 4); } if (c & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vsubdata), 0); output = (uint8_t*) output + 2; vsubdata = vext_u8(vsubdata, vsubdata, 2); } if (c & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vsubdata, 0); output = (uint8_t*) output + 1; } }
// Post-pad input channels.
size_t r = post_padding; if XNN_LIKELY(r != 0) { for (; r >= 16 * sizeof(uint8_t); r -= 16 * sizeof(uint8_t)) { vst1q_u8(output, vfill_pattern); output = (uint8_t*) output + 16; } if (r & (8 * sizeof(uint8_t))) { vst1_u8(output, vget_low_u8(vfill_pattern)); output = (uint8_t*) output + 8; } if (r & (4 * sizeof(uint8_t))) { vst1q_lane_u32(output, vreinterpretq_u32_u8(vfill_pattern), 0); output = (uint8_t*) output + 4; } uint8x8_t vfill_subpattern = vget_low_u8(vfill_pattern); if (r & (2 * sizeof(uint8_t))) { vst1_lane_u16(output, vreinterpret_u16_u8(vfill_subpattern), 0); output = (uint8_t*) output + 2; vfill_subpattern = vext_u8(vfill_subpattern, vfill_subpattern, 2); } if (r & (1 * sizeof(uint8_t))) { vst1_lane_u8(output, vfill_subpattern, 0); output = (uint8_t*) output + 1; } }
input = (const void*) ((uintptr_t) input + input_increment); output = (void*) ((uintptr_t) output + output_increment); } while (--rows != 0); }
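/*
 * Illustrative usage sketch (an assumption, not part of the library): shows
 * the byte layout xnn_x8_zip_x4_ukernel__neon produces for one 8-byte group
 * per stream. The guard macro XNN_ZIP_EXAMPLE is hypothetical and only keeps
 * the sketch out of normal builds.
 */
#ifdef XNN_ZIP_EXAMPLE
#include <stdio.h>

int main(void) {
  // Four streams of 8 bytes stored back to back: x | y | z | w.
  uint8_t input[32];
  for (int i = 0; i < 32; i++) {
    input[i] = (uint8_t) i;
  }
  uint8_t output[32];
  xnn_x8_zip_x4_ukernel__neon(8, input, output);
  // Expected interleaving: 0, 8, 16, 24, 1, 9, 17, 25, ...
  for (int i = 0; i < 32; i++) {
    printf("%u ", output[i]);
  }
  printf("\n");
  return 0;
}
#endif  // XNN_ZIP_EXAMPLE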