// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <wasm_simd128.h>

#include <xnnpack/argmaxpool.h>
#include <xnnpack/avgpool.h>
#include <xnnpack/common.h>
#include <xnnpack/conv.h>
#include <xnnpack/math.h>
#include <xnnpack/microparams.h>
#include <xnnpack/vcvt.h>


void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16(
    size_t batch,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
  const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
  const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
  const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
  const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
  const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);

  const uint16_t* i = (const uint16_t*) input;
  for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
    const v128_t vh0 = wasm_v128_load(i);
    const v128_t vh1 = wasm_v128_load(i + 8);
    i += 16;

    // Split the sign bit off the fp16 magnitude.
    const v128_t vsign0 = wasm_v128_and(vh0, vsign_mask);
    const v128_t vsign1 = wasm_v128_and(vh1, vsign_mask);

    const v128_t vnonsign0 = wasm_v128_xor(vh0, vsign0);
    const v128_t vnonsign1 = wasm_v128_xor(vh1, vsign1);

    // Normalized path: shl 13 places the fp16 mantissa bits in fp32 position (low
    // halfwords); shr 3 + exp_offset rebiases the fp16 exponent (high halfwords).
    const v128_t vprenorm0 = wasm_i16x8_shl(vnonsign0, 13);
    const v128_t vprenorm1 = wasm_i16x8_add(wasm_u16x8_shr(vnonsign0, 3), vexp_offset);
    const v128_t vprenorm2 = wasm_i16x8_shl(vnonsign1, 13);
    const v128_t vprenorm3 = wasm_i16x8_add(wasm_u16x8_shr(vnonsign1, 3), vexp_offset);

    const v128_t vnorm0 = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm0, vprenorm1, 0, 8, 1, 9, 2, 10, 3, 11), vexp_scale);
    const v128_t vnorm1 = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm0, vprenorm1, 4, 12, 5, 13, 6, 14, 7, 15), vexp_scale);
    const v128_t vnorm2 = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm2, vprenorm3, 0, 8, 1, 9, 2, 10, 3, 11), vexp_scale);
    const v128_t vnorm3 = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm2, vprenorm3, 4, 12, 5, 13, 6, 14, 7, 15), vexp_scale);

    // Denormal path: pair the fp16 bits with a magic exponent, then subtract the bias.
    const v128_t vdenorm0 = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign0, vmagic_mask, 0, 8, 1, 9, 2, 10, 3, 11), vmagic_bias);
    const v128_t vdenorm1 = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign0, vmagic_mask, 4, 12, 5, 13, 6, 14, 7, 15), vmagic_bias);
    const v128_t vdenorm2 = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign1, vmagic_mask, 0, 8, 1, 9, 2, 10, 3, 11), vmagic_bias);
    const v128_t vdenorm3 = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign1, vmagic_mask, 4, 12, 5, 13, 6, 14, 7, 15), vmagic_bias);

    // Select the normalized result only where the input exceeds the denormal cutoff.
    const v128_t vmask0 = wasm_i16x8_gt(vnonsign0, vdenorm_cutoff);
    const v128_t vmask1 = wasm_i16x8_gt(vnonsign1, vdenorm_cutoff);
    const v128_t vzero = wasm_i16x8_const_splat(0);

    const v128_t vxmask0 = wasm_i32x4_extend_low_i16x8(vmask0);
    const v128_t vxmask1 = wasm_i32x4_extend_high_i16x8(vmask0);
    const v128_t vxmask2 = wasm_i32x4_extend_low_i16x8(vmask1);
    const v128_t vxmask3 = wasm_i32x4_extend_high_i16x8(vmask1);

    const v128_t vabsf0 = wasm_v128_bitselect(vnorm0, vdenorm0, vxmask0);
    const v128_t vsignf0 = wasm_v16x8_shuffle(vzero, vsign0, 0, 8, 1, 9, 2, 10, 3, 11);
    const v128_t vabsf1 = wasm_v128_bitselect(vnorm1, vdenorm1, vxmask1);
    const v128_t vsignf1 = wasm_v16x8_shuffle(vzero, vsign0, 4, 12, 5, 13, 6, 14, 7, 15);
    const v128_t vabsf2 = wasm_v128_bitselect(vnorm2, vdenorm2, vxmask2);
    const v128_t vsignf2 = wasm_v16x8_shuffle(vzero, vsign1, 0, 8, 1, 9, 2, 10, 3, 11);
    const v128_t vabsf3 = wasm_v128_bitselect(vnorm3, vdenorm3, vxmask3);
    const v128_t vsignf3 = wasm_v16x8_shuffle(vzero, vsign1, 4, 12, 5, 13, 6, 14, 7, 15);

    const v128_t vf0 = wasm_v128_or(vsignf0, vabsf0);
    const v128_t vf1 = wasm_v128_or(vsignf1, vabsf1);
    const v128_t vf2 = wasm_v128_or(vsignf2, vabsf2);
    const v128_t vf3 = wasm_v128_or(vsignf3, vabsf3);

    wasm_v128_store(output, vf0);
    wasm_v128_store(output + 4, vf1);
    wasm_v128_store(output + 8, vf2);
    wasm_v128_store(output + 12, vf3);
    output += 16;
  }
  for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
    const v128_t vh = wasm_v128_load(i);
    i += 8;

    const v128_t vsign = wasm_v128_and(vh, vsign_mask);

    const v128_t vnonsign = wasm_v128_xor(vh, vsign);

    const v128_t vprenorm_lo = wasm_i16x8_shl(vnonsign, 13);
    const v128_t vprenorm_hi = wasm_i16x8_add(wasm_u16x8_shr(vnonsign, 3), vexp_offset);

    const v128_t vnorm_lo = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm_lo, vprenorm_hi, 0, 8, 1, 9, 2, 10, 3, 11), vexp_scale);
    const v128_t vnorm_hi = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm_lo, vprenorm_hi, 4, 12, 5, 13, 6, 14, 7, 15), vexp_scale);

    const v128_t vdenorm_lo = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign, vmagic_mask, 0, 8, 1, 9, 2, 10, 3, 11), vmagic_bias);
    const v128_t vdenorm_hi = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign, vmagic_mask, 4, 12, 5, 13, 6, 14, 7, 15), vmagic_bias);

    const v128_t vmask = wasm_i16x8_gt(vnonsign, vdenorm_cutoff);
    const v128_t vzero = wasm_i16x8_const_splat(0);

    const v128_t vxmask_lo = wasm_i32x4_extend_low_i16x8(vmask);
    const v128_t vxmask_hi = wasm_i32x4_extend_high_i16x8(vmask);

    const v128_t vabsf_lo = wasm_v128_bitselect(vnorm_lo, vdenorm_lo, vxmask_lo);
    const v128_t vsignf_lo = wasm_v16x8_shuffle(vzero, vsign, 0, 8, 1, 9, 2, 10, 3, 11);
    const v128_t vabsf_hi = wasm_v128_bitselect(vnorm_hi, vdenorm_hi, vxmask_hi);
    const v128_t vsignf_hi = wasm_v16x8_shuffle(vzero, vsign, 4, 12, 5, 13, 6, 14, 7, 15);

    const v128_t vf_lo = wasm_v128_or(vsignf_lo, vabsf_lo);
    const v128_t vf_hi = wasm_v128_or(vsignf_hi, vabsf_hi);

    wasm_v128_store(output, vf_lo);
    wasm_v128_store(output + 4, vf_hi);
    output += 8;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch >= 1 * sizeof(uint16_t));
    assert(batch <= 7 * sizeof(uint16_t));
    const v128_t vh = wasm_v128_load(i);

    const v128_t vsign = wasm_v128_and(vh, vsign_mask);

    const v128_t vnonsign = wasm_v128_xor(vh, vsign);

    const v128_t vprenorm_lo = wasm_i16x8_shl(vnonsign, 13);
    const v128_t vprenorm_hi = wasm_i16x8_add(wasm_u16x8_shr(vnonsign, 3), vexp_offset);

    const v128_t vnorm_lo = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm_lo, vprenorm_hi, 0, 8, 1, 9, 2, 10, 3, 11), vexp_scale);
    const v128_t vnorm_hi = wasm_f32x4_mul(wasm_v16x8_shuffle(vprenorm_lo, vprenorm_hi, 4, 12, 5, 13, 6, 14, 7, 15), vexp_scale);

    const v128_t vdenorm_lo = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign, vmagic_mask, 0, 8, 1, 9, 2, 10, 3, 11), vmagic_bias);
    const v128_t vdenorm_hi = wasm_f32x4_sub(wasm_v16x8_shuffle(vnonsign, vmagic_mask, 4, 12, 5, 13, 6, 14, 7, 15), vmagic_bias);

    const v128_t vmask = wasm_i16x8_gt(vnonsign, vdenorm_cutoff);
    const v128_t vzero = wasm_i16x8_const_splat(0);

    const v128_t vxmask_lo = wasm_i32x4_extend_low_i16x8(vmask);
    const v128_t vxmask_hi = wasm_i32x4_extend_high_i16x8(vmask);

    const v128_t vabsf_lo = wasm_v128_bitselect(vnorm_lo, vdenorm_lo, vxmask_lo);
    const v128_t vsignf_lo = wasm_v16x8_shuffle(vzero, vsign, 0, 8, 1, 9, 2, 10, 3, 11);
    const v128_t vabsf_hi = wasm_v128_bitselect(vnorm_hi, vdenorm_hi, vxmask_hi);
    const v128_t vsignf_hi = wasm_v16x8_shuffle(vzero, vsign, 4, 12, 5, 13, 6, 14, 7, 15);

    v128_t vf = wasm_v128_or(vsignf_lo, vabsf_lo);
    if (batch & (4 * sizeof(uint16_t))) {
      wasm_v128_store(output, vf);
      output += 4;
      vf = wasm_v128_or(vsignf_hi, vabsf_hi);
    }
    if (batch & (2 * sizeof(uint16_t))) {
      wasm_v128_store64_lane(output, vf, 0);
      vf = wasm_v64x2_shuffle(vf, vf, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(uint16_t))) {
      wasm_v128_store32_lane(output, vf, 0);
    }
  }
}
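// Annotation: the argmax pooling kernels below keep two vectors per group of 4
// channels, the running maximum (vmax) and the pooling-element index where it
// occurred (vidx). Each candidate produces a comparison mask via wasm_f32x4_gt,
// which drives wasm_v128_bitselect to update value and index lanes branch-free.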
void xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index_ptr,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 4);
  assert(channels != 0);

  float* index = (float*) index_ptr;
  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements != 4) {
      i3 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3 = wasm_v128_load(i3); i3 += 4;

      v128_t vmax = vi0;
      v128_t vidx = wasm_i32x4_const_splat(0);

      const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
      vmax = wasm_v128_bitselect(vi1, vmax, vm1);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(1), vidx, vm1);

      const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
      vmax = wasm_v128_bitselect(vi2, vmax, vm2);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(2), vidx, vm2);

      const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
      vmax = wasm_v128_bitselect(vi3, vmax, vm3);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(3), vidx, vm3);

      wasm_v128_store(output, vmax); output += 4;
      wasm_v128_store(index, vidx); index += 4;
    }
    if (c != 0) {
      const v128_t vi0 = wasm_v128_load(i0);
      const v128_t vi1 = wasm_v128_load(i1);
      const v128_t vi2 = wasm_v128_load(i2);
      const v128_t vi3 = wasm_v128_load(i3);

      v128_t vmax = vi0;
      v128_t vidx = wasm_i32x4_const_splat(0);

      const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
      vmax = wasm_v128_bitselect(vi1, vmax, vm1);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(1), vidx, vm1);

      const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
      vmax = wasm_v128_bitselect(vi2, vmax, vm2);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(2), vidx, vm2);

      const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
      vmax = wasm_v128_bitselect(vi3, vmax, vm3);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(3), vidx, vm3);

      if (c & 2) {
        wasm_v128_store64_lane(output, vmax, 0);
        wasm_v128_store64_lane(index, vidx, 0);
        vmax = wasm_v64x2_shuffle(vmax, vmax, 1, 1);
        vidx = wasm_v64x2_shuffle(vidx, vidx, 1, 1);
        output += 2;
        index += 2;
      }
      if (c & 1) {
        wasm_v128_store32_lane(output, vmax, 0);
        wasm_v128_store32_lane(index, vidx, 0);
        output += 1;
        index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
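// Annotation: 9p8x is the multipass argmax variant. The first pass reduces 9
// pooling elements into accumulation_buffer/index_buffer, each middle pass
// folds in 8 more elements (vidx0 starts at 9 = 1 + 8 and advances by 8 per
// pass), and a last pass of up to 8 elements writes the final maxima and
// indices to output/index.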
void xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* accumulation_buffer,
    uint32_t* index_buffer,
    float* output,
    uint32_t* index,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements > 9);
  assert(channels != 0);

  do {
    {
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      const float* i8 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      i8 = (const float*) ((uintptr_t) i8 + input_offset);

      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

        v128_t vmax = vi0;
        v128_t vidx = wasm_i32x4_const_splat(0);

        const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
        vmax = wasm_v128_bitselect(vi1, vmax, vm1);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(1), vidx, vm1);

        const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
        vmax = wasm_v128_bitselect(vi2, vmax, vm2);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(2), vidx, vm2);

        const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
        vmax = wasm_v128_bitselect(vi3, vmax, vm3);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(3), vidx, vm3);

        const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
        vmax = wasm_v128_bitselect(vi4, vmax, vm4);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(4), vidx, vm4);

        const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
        vmax = wasm_v128_bitselect(vi5, vmax, vm5);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(5), vidx, vm5);

        const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
        vmax = wasm_v128_bitselect(vi6, vmax, vm6);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(6), vidx, vm6);

        const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
        vmax = wasm_v128_bitselect(vi7, vmax, vm7);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(7), vidx, vm7);

        const v128_t vm8 = wasm_f32x4_gt(vi8, vmax);
        vmax = wasm_v128_bitselect(vi8, vmax, vm8);
        vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(8), vidx, vm8);

        wasm_v128_store(ab, vmax); ab += 4;
        wasm_v128_store(ib, vidx); ib += 4;
      }
    }

    const v128_t v1 = wasm_i32x4_const_splat(1);
    const v128_t v8 = wasm_i32x4_const_splat(8);
    v128_t vidx0 = wasm_i32x4_add(v1, v8);

    size_t k = pooling_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      const float* i1 = *input++;
      const float* i2 = *input++;
      const float* i3 = *input++;
      const float* i4 = *input++;
      const float* i5 = *input++;
      const float* i6 = *input++;
      const float* i7 = *input++;
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);

      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;

      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;

        v128_t vmax = wasm_v128_load(ab);
        v128_t vidx = wasm_v128_load(ib);

        const v128_t vm0 = wasm_f32x4_gt(vi0, vmax);
        vmax = wasm_v128_bitselect(vi0, vmax, vm0);
        vidx = wasm_v128_bitselect(vidx0, vidx, vm0);

        const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
        const v128_t vidx1 = wasm_i32x4_add(vidx0, v1);
        vmax = wasm_v128_bitselect(vi1, vmax, vm1);
        vidx = wasm_v128_bitselect(vidx1, vidx, vm1);

        const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
        const v128_t vidx2 = wasm_i32x4_add(vidx1, v1);
        vmax = wasm_v128_bitselect(vi2, vmax, vm2);
        vidx = wasm_v128_bitselect(vidx2, vidx, vm2);

        const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
        const v128_t vidx3 = wasm_i32x4_add(vidx2, v1);
        vmax = wasm_v128_bitselect(vi3, vmax, vm3);
        vidx = wasm_v128_bitselect(vidx3, vidx, vm3);

        const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
        const v128_t vidx4 = wasm_i32x4_add(vidx3, v1);
        vmax = wasm_v128_bitselect(vi4, vmax, vm4);
        vidx = wasm_v128_bitselect(vidx4, vidx, vm4);

        const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
        const v128_t vidx5 = wasm_i32x4_add(vidx4, v1);
        vmax = wasm_v128_bitselect(vi5, vmax, vm5);
        vidx = wasm_v128_bitselect(vidx5, vidx, vm5);

        const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
        const v128_t vidx6 = wasm_i32x4_add(vidx5, v1);
        vmax = wasm_v128_bitselect(vi6, vmax, vm6);
        vidx = wasm_v128_bitselect(vidx6, vidx, vm6);

        const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
        const v128_t vidx7 = wasm_i32x4_add(vidx6, v1);
        vmax = wasm_v128_bitselect(vi7, vmax, vm7);
        vidx = wasm_v128_bitselect(vidx7, vidx, vm7);

        wasm_v128_store(ab, vmax); ab += 4;
        wasm_v128_store(ib, vidx); ib += 4;
      }
      vidx0 = wasm_i32x4_add(vidx0, v8);
    }

    float* o = output;
    float* i = (float*) index;
    {
      const float* i0 = input[0];
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = i0;
      }
      if (k <= 2) {
        i2 = i0;
      }
      if (k < 4) {
        i3 = i0;
      }
      if (k <= 4) {
        i4 = i0;
      }
      if (k < 6) {
        i5 = i0;
      }
      if (k <= 6) {
        i6 = i0;
      }
      if (k != 8) {
        i7 = i0;
      }

      size_t c = channels;
      float* ab = accumulation_buffer;
      uint32_t* ib = index_buffer;
      for (; c >= 4; c -= 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;

        v128_t vmax = wasm_v128_load(ab); ab += 4;
        v128_t vidx = wasm_v128_load(ib); ib += 4;

        const v128_t vm0 = wasm_f32x4_gt(vi0, vmax);
        vmax = wasm_v128_bitselect(vi0, vmax, vm0);
        vidx = wasm_v128_bitselect(vidx0, vidx, vm0);

        const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
        const v128_t vidx1 = wasm_i32x4_add(vidx0, v1);
        vmax = wasm_v128_bitselect(vi1, vmax, vm1);
        vidx = wasm_v128_bitselect(vidx1, vidx, vm1);

        const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
        const v128_t vidx2 = wasm_i32x4_add(vidx1, v1);
        vmax = wasm_v128_bitselect(vi2, vmax, vm2);
        vidx = wasm_v128_bitselect(vidx2, vidx, vm2);

        const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
        const v128_t vidx3 = wasm_i32x4_add(vidx2, v1);
        vmax = wasm_v128_bitselect(vi3, vmax, vm3);
        vidx = wasm_v128_bitselect(vidx3, vidx, vm3);

        const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
        const v128_t vidx4 = wasm_i32x4_add(vidx3, v1);
        vmax = wasm_v128_bitselect(vi4, vmax, vm4);
        vidx = wasm_v128_bitselect(vidx4, vidx, vm4);

        const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
        const v128_t vidx5 = wasm_i32x4_add(vidx4, v1);
        vmax = wasm_v128_bitselect(vi5, vmax, vm5);
        vidx = wasm_v128_bitselect(vidx5, vidx, vm5);

        const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
        const v128_t vidx6 = wasm_i32x4_add(vidx5, v1);
        vmax = wasm_v128_bitselect(vi6, vmax, vm6);
        vidx = wasm_v128_bitselect(vidx6, vidx, vm6);

        const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
        const v128_t vidx7 = wasm_i32x4_add(vidx6, v1);
        vmax = wasm_v128_bitselect(vi7, vmax, vm7);
        vidx = wasm_v128_bitselect(vidx7, vidx, vm7);

        wasm_v128_store(o, vmax); o += 4;
        wasm_v128_store(i, vidx); i += 4;
      }
      if (c != 0) {
        const v128_t vi0 = wasm_v128_load(i0);
        const v128_t vi1 = wasm_v128_load(i1);
        const v128_t vi2 = wasm_v128_load(i2);
        const v128_t vi3 = wasm_v128_load(i3);
        const v128_t vi4 = wasm_v128_load(i4);
        const v128_t vi5 = wasm_v128_load(i5);
        const v128_t vi6 = wasm_v128_load(i6);
        const v128_t vi7 = wasm_v128_load(i7);

        v128_t vmax = wasm_v128_load(ab);
        v128_t vidx = wasm_v128_load(ib);

        const v128_t vm0 = wasm_f32x4_gt(vi0, vmax);
        vmax = wasm_v128_bitselect(vi0, vmax, vm0);
        vidx = wasm_v128_bitselect(vidx0, vidx, vm0);

        const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
        const v128_t vidx1 = wasm_i32x4_add(vidx0, v1);
        vmax = wasm_v128_bitselect(vi1, vmax, vm1);
        vidx = wasm_v128_bitselect(vidx1, vidx, vm1);

        const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
        const v128_t vidx2 = wasm_i32x4_add(vidx1, v1);
        vmax = wasm_v128_bitselect(vi2, vmax, vm2);
        vidx = wasm_v128_bitselect(vidx2, vidx, vm2);

        const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
        const v128_t vidx3 = wasm_i32x4_add(vidx2, v1);
        vmax = wasm_v128_bitselect(vi3, vmax, vm3);
        vidx = wasm_v128_bitselect(vidx3, vidx, vm3);

        const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
        const v128_t vidx4 = wasm_i32x4_add(vidx3, v1);
        vmax = wasm_v128_bitselect(vi4, vmax, vm4);
        vidx = wasm_v128_bitselect(vidx4, vidx, vm4);

        const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
        const v128_t vidx5 = wasm_i32x4_add(vidx4, v1);
        vmax = wasm_v128_bitselect(vi5, vmax, vm5);
        vidx = wasm_v128_bitselect(vidx5, vidx, vm5);

        const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
        const v128_t vidx6 = wasm_i32x4_add(vidx5, v1);
        vmax = wasm_v128_bitselect(vi6, vmax, vm6);
        vidx = wasm_v128_bitselect(vidx6, vidx, vm6);
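// Annotation: 9x is the single-pass argmax variant for up to 9 pooling
// elements. Rows past pooling_elements alias i0, so their loads compare an
// element against itself and never update vmax/vidx.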
        const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
        const v128_t vidx7 = wasm_i32x4_add(vidx6, v1);
        vmax = wasm_v128_bitselect(vi7, vmax, vm7);
        vidx = wasm_v128_bitselect(vidx7, vidx, vm7);

        if (c & 2) {
          wasm_v128_store64_lane(o, vmax, 0);
          wasm_v128_store64_lane(i, vidx, 0);
          vmax = wasm_v64x2_shuffle(vmax, vmax, 1, 1);
          vidx = wasm_v64x2_shuffle(vidx, vidx, 1, 1);
          o += 2;
          i += 2;
        }
        if (c & 1) {
          wasm_v128_store32_lane(o, vmax, 0);
          wasm_v128_store32_lane(i, vidx, 0);
          o += 1;
          i += 1;
        }
      }
    }

    output = (float*) ((uintptr_t) o + output_increment);
    index = (uint32_t*) i;
  } while (--output_pixels != 0);
}

void xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4(
    size_t output_pixels,
    size_t pooling_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    float* output,
    uint32_t* index_ptr,
    size_t input_increment,
    size_t output_increment) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(pooling_elements != 0);
  assert(pooling_elements <= 9);
  assert(channels != 0);

  float* index = (float*) index_ptr;
  do {
    const float* i0 = input[0];
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    i0 = (const float*) ((uintptr_t) i0 + input_offset);
    i1 = (const float*) ((uintptr_t) i1 + input_offset);
    i2 = (const float*) ((uintptr_t) i2 + input_offset);
    i3 = (const float*) ((uintptr_t) i3 + input_offset);
    i4 = (const float*) ((uintptr_t) i4 + input_offset);
    i5 = (const float*) ((uintptr_t) i5 + input_offset);
    i6 = (const float*) ((uintptr_t) i6 + input_offset);
    i7 = (const float*) ((uintptr_t) i7 + input_offset);
    i8 = (const float*) ((uintptr_t) i8 + input_offset);
    if (pooling_elements < 2) {
      i1 = i0;
    }
    if (pooling_elements <= 2) {
      i2 = i0;
    }
    if (pooling_elements < 4) {
      i3 = i0;
    }
    if (pooling_elements <= 4) {
      i4 = i0;
    }
    if (pooling_elements < 6) {
      i5 = i0;
    }
    if (pooling_elements <= 6) {
      i6 = i0;
    }
    if (pooling_elements < 8) {
      i7 = i0;
    }
    if (pooling_elements <= 8) {
      i8 = i0;
    }

    size_t c = channels;
    for (; c >= 4; c -= 4) {
      const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
      const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
      const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
      const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
      const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
      const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

      v128_t vmax = vi0;
      v128_t vidx = wasm_i32x4_const_splat(0);

      const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
      vmax = wasm_v128_bitselect(vi1, vmax, vm1);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(1), vidx, vm1);

      const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
      vmax = wasm_v128_bitselect(vi2, vmax, vm2);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(2), vidx, vm2);

      const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
      vmax = wasm_v128_bitselect(vi3, vmax, vm3);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(3), vidx, vm3);

      const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
      vmax = wasm_v128_bitselect(vi4, vmax, vm4);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(4), vidx, vm4);

      const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
      vmax = wasm_v128_bitselect(vi5, vmax, vm5);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(5), vidx, vm5);

      const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
      vmax = wasm_v128_bitselect(vi6, vmax, vm6);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(6), vidx, vm6);
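// Annotation: the average pooling kernels below follow the same
// first/middle/last pass structure as the 9p8x argmax kernel, but accumulate
// running sums in `buffer`, then scale and clamp on the last pass. The _arm_
// and _x86_ variants differ only in the clamp: wasm_f32x4_max/min here versus
// wasm_f32x4_pmax/pmin below, whichever form lowers better on the named target.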
      const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
      vmax = wasm_v128_bitselect(vi7, vmax, vm7);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(7), vidx, vm7);

      const v128_t vm8 = wasm_f32x4_gt(vi8, vmax);
      vmax = wasm_v128_bitselect(vi8, vmax, vm8);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(8), vidx, vm8);

      wasm_v128_store(output, vmax); output += 4;
      wasm_v128_store(index, vidx); index += 4;
    }
    if (c != 0) {
      const v128_t vi0 = wasm_v128_load(i0);
      const v128_t vi1 = wasm_v128_load(i1);
      const v128_t vi2 = wasm_v128_load(i2);
      const v128_t vi3 = wasm_v128_load(i3);
      const v128_t vi4 = wasm_v128_load(i4);
      const v128_t vi5 = wasm_v128_load(i5);
      const v128_t vi6 = wasm_v128_load(i6);
      const v128_t vi7 = wasm_v128_load(i7);
      const v128_t vi8 = wasm_v128_load(i8);

      v128_t vmax = vi0;
      v128_t vidx = wasm_i32x4_const_splat(0);

      const v128_t vm1 = wasm_f32x4_gt(vi1, vmax);
      vmax = wasm_v128_bitselect(vi1, vmax, vm1);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(1), vidx, vm1);

      const v128_t vm2 = wasm_f32x4_gt(vi2, vmax);
      vmax = wasm_v128_bitselect(vi2, vmax, vm2);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(2), vidx, vm2);

      const v128_t vm3 = wasm_f32x4_gt(vi3, vmax);
      vmax = wasm_v128_bitselect(vi3, vmax, vm3);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(3), vidx, vm3);

      const v128_t vm4 = wasm_f32x4_gt(vi4, vmax);
      vmax = wasm_v128_bitselect(vi4, vmax, vm4);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(4), vidx, vm4);

      const v128_t vm5 = wasm_f32x4_gt(vi5, vmax);
      vmax = wasm_v128_bitselect(vi5, vmax, vm5);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(5), vidx, vm5);

      const v128_t vm6 = wasm_f32x4_gt(vi6, vmax);
      vmax = wasm_v128_bitselect(vi6, vmax, vm6);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(6), vidx, vm6);

      const v128_t vm7 = wasm_f32x4_gt(vi7, vmax);
      vmax = wasm_v128_bitselect(vi7, vmax, vm7);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(7), vidx, vm7);

      const v128_t vm8 = wasm_f32x4_gt(vi8, vmax);
      vmax = wasm_v128_bitselect(vi8, vmax, vm8);
      vidx = wasm_v128_bitselect(wasm_i32x4_const_splat(8), vidx, vm8);

      if (c & 2) {
        wasm_v128_store64_lane(output, vmax, 0);
        wasm_v128_store64_lane(index, vidx, 0);
        vmax = wasm_v64x2_shuffle(vmax, vmax, 1, 1);
        vidx = wasm_v64x2_shuffle(vidx, vidx, 1, 1);
        output += 2;
        index += 2;
      }
      if (c & 1) {
        wasm_v128_store32_lane(output, vmax, 0);
        wasm_v128_store32_lane(index, vidx, 0);
        output += 1;
        index += 1;
      }
    }
    input = (const float**) ((uintptr_t) input + input_increment);
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}

void xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* buffer,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);

  do {
    {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }
      const float* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

        wasm_v128_store(b, vsum); b += 4;
      }
    }

    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vacc = wasm_v128_load(b);

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        wasm_v128_store(b, vsum); b += 4;
      }
    }

    {
      const float* i0 = input[0];
      assert(i0 != NULL);
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      size_t c = channels;
      float* b = buffer;
      while (c >= 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vacc = wasm_v128_load(b); b += 4;

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        v128_t vout = wasm_f32x4_mul(vsum, vscale);
        vout = wasm_f32x4_max(vout, vmin);
        vout = wasm_f32x4_min(vout, vmax);

        wasm_v128_store(output, vout);
        output += 4;
        c -= 4;
      }
      if (c != 0) {
        const v128_t vi0 = wasm_v128_load(i0);
        const v128_t vi1 = wasm_v128_load(i1);
        const v128_t vi2 = wasm_v128_load(i2);
        const v128_t vi3 = wasm_v128_load(i3);
        const v128_t vi4 = wasm_v128_load(i4);
        const v128_t vi5 = wasm_v128_load(i5);
        const v128_t vi6 = wasm_v128_load(i6);
        const v128_t vi7 = wasm_v128_load(i7);
        const v128_t vacc = wasm_v128_load(b);

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        v128_t vout = wasm_f32x4_mul(vsum, vscale);
        vout = wasm_f32x4_max(vout, vmin);
        vout = wasm_f32x4_min(vout, vmax);

        if (c & 2) {
          wasm_v128_store64_lane(output, vout, 0);
          vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
          output += 2;
        }
        if (c & 1) {
          wasm_v128_store32_lane(output, vout, 0);
          output += 1;
        }
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
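// Annotation: same as the _arm_ variant above except for the clamp.
// wasm_f32x4_pmax(vmin, vout) returns vout when vmin < vout and propagates
// vmin otherwise, matching the x86 maxps/minps operand convention.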
void xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* buffer,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);

  do {
    {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }
      const float* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const float*) ((uintptr_t) i8 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

        wasm_v128_store(b, vsum); b += 4;
      }
    }

    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const float* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      const float* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      const float* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      const float* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      const float* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      const float* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      float* b = buffer;
      for (size_t c = 0; c < channels; c += 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vacc = wasm_v128_load(b);

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        wasm_v128_store(b, vsum); b += 4;
      }
    }

    {
      const float* i0 = input[0];
      assert(i0 != NULL);
      const float* i1 = input[1];
      const float* i2 = input[2];
      const float* i3 = input[3];
      const float* i4 = input[4];
      const float* i5 = input[5];
      const float* i6 = input[6];
      const float* i7 = input[7];
      input = (const float**) ((uintptr_t) input + input_increment);
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const float*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const float*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const float*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const float*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const float*) ((uintptr_t) i7 + input_offset);
      }

      size_t c = channels;
      float* b = buffer;
      while (c >= 4) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
        const v128_t vacc = wasm_v128_load(b); b += 4;

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        v128_t vout = wasm_f32x4_mul(vsum, vscale);
        vout = wasm_f32x4_pmax(vmin, vout);
        vout = wasm_f32x4_pmin(vmax, vout);

        wasm_v128_store(output, vout);
        output += 4;
        c -= 4;
      }
      if (c != 0) {
        const v128_t vi0 = wasm_v128_load(i0);
        const v128_t vi1 = wasm_v128_load(i1);
        const v128_t vi2 = wasm_v128_load(i2);
        const v128_t vi3 = wasm_v128_load(i3);
        const v128_t vi4 = wasm_v128_load(i4);
        const v128_t vi5 = wasm_v128_load(i5);
        const v128_t vi6 = wasm_v128_load(i6);
        const v128_t vi7 = wasm_v128_load(i7);
        const v128_t vacc = wasm_v128_load(b);

        const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
        const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
        const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
        const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
        const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc);
        const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
        const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67);
        const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a);

        v128_t vout = wasm_f32x4_mul(vsum, vscale);
        vout = wasm_f32x4_pmax(vmin, vout);
        vout = wasm_f32x4_pmin(vmax, vout);

        if (c & 2) {
          wasm_v128_store64_lane(output, vout, 0);
          vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
          output += 2;
        }
        if (c & 1) {
          wasm_v128_store32_lane(output, vout, 0);
          output += 1;
        }
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
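// Annotation: single-pass average pooling for up to 9 kernel elements. Rows
// past kernel_elements point at the shared zero vector, so they contribute
// nothing to the sum; the ARM-friendly max/min pair performs the clamp.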
void xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);

  do {
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    input = (const float**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) {
      i1 = zero;
    }
    assert(i1 != NULL);
    if (kernel_elements <= 2) {
      i2 = zero;
    }
    assert(i2 != NULL);
    if (kernel_elements < 4) {
      i3 = zero;
    }
    assert(i3 != NULL);
    if (kernel_elements <= 4) {
      i4 = zero;
    }
    assert(i4 != NULL);
    if (kernel_elements < 6) {
      i5 = zero;
    }
    assert(i5 != NULL);
    if (kernel_elements <= 6) {
      i6 = zero;
    }
    assert(i6 != NULL);
    if (kernel_elements < 8) {
      i7 = zero;
    }
    assert(i7 != NULL);
    if (kernel_elements <= 8) {
      i8 = zero;
    }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 4) {
      const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
      const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
      const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
      const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
      const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
      const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

      const v128_t vsum018 = wasm_f32x4_add(wasm_f32x4_add(vi0, vi1), vi8);
      const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
      const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
      const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
      const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
      const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
      const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

      v128_t vout = wasm_f32x4_mul(vsum, vscale);
      vout = wasm_f32x4_max(vout, vmin);
      vout = wasm_f32x4_min(vout, vmax);

      wasm_v128_store(output, vout);
      output += 4;
      c -= 4;
    }
    if (c != 0) {
      const v128_t vi0 = wasm_v128_load(i0);
      const v128_t vi1 = wasm_v128_load(i1);
      const v128_t vi2 = wasm_v128_load(i2);
      const v128_t vi3 = wasm_v128_load(i3);
      const v128_t vi4 = wasm_v128_load(i4);
      const v128_t vi5 = wasm_v128_load(i5);
      const v128_t vi6 = wasm_v128_load(i6);
      const v128_t vi7 = wasm_v128_load(i7);
      const v128_t vi8 = wasm_v128_load(i8);

      const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
      const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
      const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
      const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
      const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8);
      const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
      const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
      const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

      v128_t vout = wasm_f32x4_mul(vsum, vscale);
      vout = wasm_f32x4_max(vout, vmin);
      vout = wasm_f32x4_min(vout, vmax);

      if (c & 2) {
        wasm_v128_store64_lane(output, vout, 0);
        vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
        output += 2;
      }
      if (c & 1) {
        wasm_v128_store32_lane(output, vout, 0);
        output += 1;
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
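// Annotation: x86-flavored single-pass average pooling; identical to the
// kernel above up to the pmax/pmin clamp.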
void xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const float** input,
    size_t input_offset,
    const float* zero,
    float* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements != 0);
  assert(kernel_elements <= 9);
  assert(channels != 0);

  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);

  do {
    const float* i0 = input[0];
    assert(i0 != NULL);
    const float* i1 = input[1];
    const float* i2 = input[2];
    const float* i3 = input[3];
    const float* i4 = input[4];
    const float* i5 = input[5];
    const float* i6 = input[6];
    const float* i7 = input[7];
    const float* i8 = input[8];
    input = (const float**) ((uintptr_t) input + input_increment);
    if (kernel_elements < 2) {
      i1 = zero;
    }
    assert(i1 != NULL);
    if (kernel_elements <= 2) {
      i2 = zero;
    }
    assert(i2 != NULL);
    if (kernel_elements < 4) {
      i3 = zero;
    }
    assert(i3 != NULL);
    if (kernel_elements <= 4) {
      i4 = zero;
    }
    assert(i4 != NULL);
    if (kernel_elements < 6) {
      i5 = zero;
    }
    assert(i5 != NULL);
    if (kernel_elements <= 6) {
      i6 = zero;
    }
    assert(i6 != NULL);
    if (kernel_elements < 8) {
      i7 = zero;
    }
    assert(i7 != NULL);
    if (kernel_elements <= 8) {
      i8 = zero;
    }
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const float*) ((uintptr_t) i0 + input_offset);
    }
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const float*) ((uintptr_t) i1 + input_offset);
    }
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const float*) ((uintptr_t) i2 + input_offset);
    }
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const float*) ((uintptr_t) i3 + input_offset);
    }
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const float*) ((uintptr_t) i4 + input_offset);
    }
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const float*) ((uintptr_t) i5 + input_offset);
    }
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const float*) ((uintptr_t) i6 + input_offset);
    }
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const float*) ((uintptr_t) i7 + input_offset);
    }
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const float*) ((uintptr_t) i8 + input_offset);
    }

    size_t c = channels;
    while (c >= 4) {
      const v128_t vi0 = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1 = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2 = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3 = wasm_v128_load(i3); i3 += 4;
      const v128_t vi4 = wasm_v128_load(i4); i4 += 4;
      const v128_t vi5 = wasm_v128_load(i5); i5 += 4;
      const v128_t vi6 = wasm_v128_load(i6); i6 += 4;
      const v128_t vi7 = wasm_v128_load(i7); i7 += 4;
      const v128_t vi8 = wasm_v128_load(i8); i8 += 4;

      const v128_t vsum018 = wasm_f32x4_add(wasm_f32x4_add(vi0, vi1), vi8);
      const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
      const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
      const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
      const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
      const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
      const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

      v128_t vout = wasm_f32x4_mul(vsum, vscale);
      vout = wasm_f32x4_pmax(vmin, vout);
      vout = wasm_f32x4_pmin(vmax, vout);

      wasm_v128_store(output, vout);
      output += 4;
      c -= 4;
    }
    if (c != 0) {
      const v128_t vi0 = wasm_v128_load(i0);
      const v128_t vi1 = wasm_v128_load(i1);
      const v128_t vi2 = wasm_v128_load(i2);
      const v128_t vi3 = wasm_v128_load(i3);
      const v128_t vi4 = wasm_v128_load(i4);
      const v128_t vi5 = wasm_v128_load(i5);
      const v128_t vi6 = wasm_v128_load(i6);
      const v128_t vi7 = wasm_v128_load(i7);
      const v128_t vi8 = wasm_v128_load(i8);

      const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
      const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
      const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
      const v128_t vsum67 = wasm_f32x4_add(vi6, vi7);
      const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8);
      const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
      const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67);
      const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678);

      v128_t vout = wasm_f32x4_mul(vsum, vscale);
      vout = wasm_f32x4_pmax(vmin, vout);
      vout = wasm_f32x4_pmin(vmax, vout);

      if (c & 2) {
        wasm_v128_store64_lane(output, vout, 0);
        vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
        output += 2;
      }
      if (c & 1) {
        wasm_v128_store32_lane(output, vout, 0);
        output += 1;
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}
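// Annotation: direct 3x3 stride-2 convolution from 3-channel HWC input to
// CHW output, 4 output channels per weight tile and a 2x2 output block per
// step. Input pixels are packed c0,c1,c2 triplets, so the lane shuffles below
// broadcast one channel of one pixel; the iMxN comments track which
// pixel/channel each aligned 4-float load holds.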
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_channel_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      // Last row of a pair may not exist; alias it so stores are idempotent.
      output1 = output0;
    }

    const float* w = weights;
    size_t c = output_channels;
    float* o0c0 = output0;
    float* o1c0 = output1;
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
    float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
    float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
    float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
    do {
      if XNN_UNPREDICTABLE(c < 2) {
        o0c1 = o0c0;
        o1c1 = o1c0;
      }
      if XNN_UNPREDICTABLE(c <= 2) {
        o0c2 = o0c1;
        o1c2 = o1c1;
      }
      if XNN_UNPREDICTABLE(c < 4) {
        o0c3 = o0c2;
        o1c3 = o1c2;
      }

      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      v128_t vi0x0 = wasm_f32x4_const_splat(0.0f);
      v128_t vi1x0 = wasm_f32x4_const_splat(0.0f);
      v128_t vi2x0 = wasm_f32x4_const_splat(0.0f);
      v128_t vi3x0 = wasm_f32x4_const_splat(0.0f);
      v128_t vi4x0 = wasm_f32x4_const_splat(0.0f);

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
        v128_t vo0x0 = wasm_v128_load(w);
        v128_t vo1x0 = vo0x0;
        v128_t vo0x1 = vo0x0;
        v128_t vo1x1 = vo0x0;

        const v128_t vk00c0 = wasm_v128_load(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const v128_t vi0x1 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1x1 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2x1 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3x1 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4x1 = wasm_v128_load(i4); i4 += 4;

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi0x0, vi0x0, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3)));

        const v128_t vk10c0 = wasm_v128_load(w + 8);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi1x0, vi1x0, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi3x0, vi3x0, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 3, 3, 3, 3)));

        const v128_t vk20c0 = wasm_v128_load(w + 12);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi4x0, vi4x0, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 3, 3, 3, 3)));

        const v128_t vk00c1 = wasm_v128_load(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const v128_t vi0x2 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1x2 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2x2 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3x2 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4x2 = wasm_v128_load(i4); i4 += 4;

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi0x0, vi0x0, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0)));

        const v128_t vk10c1 = wasm_v128_load(w + 20);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x0, vi1x0, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x0, vi3x0, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0)));

        const v128_t vk20c1 = wasm_v128_load(w + 24);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi4x0, vi4x0, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 0, 0, 0, 0)));

        const v128_t vk00c2 = wasm_v128_load(w + 28);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi0x0, vi0x0, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi0x2, vi0x2, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1)));

        const v128_t vk10c2 = wasm_v128_load(w + 32);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi1x0, vi1x0, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi3x0, vi3x0, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi1x2, vi1x2, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi3x2, vi3x2, 1, 1, 1, 1)));

        const v128_t vk20c2 = wasm_v128_load(w + 36);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi4x0, vi4x0, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi4x2, vi4x2, 1, 1, 1, 1)));

        const v128_t vk01c0 = wasm_v128_load(w + 40);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 0, 0, 0, 0)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi0x2, vi0x2, 2, 2, 2, 2)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi2x2, vi2x2, 2, 2, 2, 2)));

        const v128_t vk11c0 = wasm_v128_load(w + 44);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 0, 0, 0, 0)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 0, 0, 0, 0)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi1x2, vi1x2, 2, 2, 2, 2)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi3x2, vi3x2, 2, 2, 2, 2)));

        const v128_t vk21c0 = wasm_v128_load(w + 48);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 0, 0, 0, 0)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi2x2, vi2x2, 2, 2, 2, 2)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi4x2, vi4x2, 2, 2, 2, 2)));

        const v128_t vk01c1 = wasm_v128_load(w + 52);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi0x1, vi0x1, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3)));

        const v128_t vk11c1 = wasm_v128_load(w + 56);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi1x1, vi1x1, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi3x1, vi3x1, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 3, 3, 3, 3)));

        const v128_t vk21c1 = wasm_v128_load(w + 60);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x1, vi4x1, 1, 1, 1, 1)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 3, 3, 3, 3)));

        const v128_t vk01c2 = wasm_v128_load(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const v128_t vi0x3 = wasm_v128_load(i0); i0 += 4;
        const v128_t vi1x3 = wasm_v128_load(i1); i1 += 4;
        const v128_t vi2x3 = wasm_v128_load(i2); i2 += 4;
        const v128_t vi3x3 = wasm_v128_load(i3); i3 += 4;
        const v128_t vi4x3 = wasm_v128_load(i4); i4 += 4;

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi0x1, vi0x1, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi0x3, vi0x3, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 0, 0, 0, 0)));

        const v128_t vk11c2 = wasm_v128_load(w + 68);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi1x1, vi1x1, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi3x1, vi3x1, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi1x3, vi1x3, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi3x3, vi3x3, 0, 0, 0, 0)));

        const v128_t vk21c2 = wasm_v128_load(w + 72);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x1, vi4x1, 2, 2, 2, 2)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 0, 0, 0, 0)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 0, 0, 0, 0)));

        const v128_t vk02c0 = wasm_v128_load(w + 76);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi0x3, vi0x3, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi2x3, vi2x3, 1, 1, 1, 1)));

        const v128_t vk12c0 = wasm_v128_load(w + 80);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi1x3, vi1x3, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi3x3, vi3x3, 1, 1, 1, 1)));

        const v128_t vk22c0 = wasm_v128_load(w + 84);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 3, 3, 3, 3)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi2x3, vi2x3, 1, 1, 1, 1)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi4x3, vi4x3, 1, 1, 1, 1)));

        const v128_t vk02c1 = wasm_v128_load(w + 88);

        vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 0, 0, 0, 0)));
        vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0)));
        vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi0x3, vi0x3, 2, 2, 2, 2)));
        vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi2x3, vi2x3, 2, 2, 2, 2)));

        const v128_t vk12c1 = wasm_v128_load(w +
92); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi1x3, vi1x3, 2, 2, 2, 2))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi3x3, vi3x3, 2, 2, 2, 2))); const v128_t vk22c1 = wasm_v128_load(w + 96); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 0, 0, 0, 0))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi2x3, vi2x3, 2, 2, 2, 2))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi4x3, vi4x3, 2, 2, 2, 2))); const v128_t vk02c2 = wasm_v128_load(w + 100); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi0x2, vi0x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi0x3, vi0x3, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 3, 3, 3, 3))); const v128_t vk12c2 = wasm_v128_load(w + 104); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi1x2, vi1x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi3x2, vi3x2, 1, 1, 1, 1))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi1x3, vi1x3, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi3x3, vi3x3, 3, 3, 3, 3))); const v128_t vk22c2 = wasm_v128_load(w + 108); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi4x2, vi4x2, 1, 1, 1, 1))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 3, 3, 3, 3))); vi0x0 = vi0x3; vi1x0 = vi1x3; vi2x0 = vi2x3; vi3x0 = vi3x3; vi4x0 = vi4x3; vo0x0 = wasm_f32x4_pmax(vmin, vo0x0); vo1x0 = wasm_f32x4_pmax(vmin, vo1x0); vo0x1 = wasm_f32x4_pmax(vmin, vo0x1); vo1x1 = wasm_f32x4_pmax(vmin, vo1x1); vo0x0 = wasm_f32x4_pmin(vmax, vo0x0); vo1x0 = wasm_f32x4_pmin(vmax, vo1x0); vo0x1 = wasm_f32x4_pmin(vmax, vo0x1); vo1x1 = wasm_f32x4_pmin(vmax, vo1x1); const v128_t vo0c01 = wasm_v32x4_shuffle(vo0x0, vo0x1, 0, 4, 1, 5); const v128_t vo0c23 = wasm_v32x4_shuffle(vo0x0, vo0x1, 2, 6, 3, 7); const v128_t vo1c01 = wasm_v32x4_shuffle(vo1x0, vo1x1, 0, 4, 1, 5); const v128_t vo1c23 = wasm_v32x4_shuffle(vo1x0, vo1x1, 2, 6, 3, 7); // Always 2+ output width elements remaining wasm_v128_store64_lane(o1c0, vo1c01, 0); o1c0 += 2; wasm_v128_store64_lane(o1c1, vo1c01, 1); o1c1 += 2; wasm_v128_store64_lane(o1c2, vo1c23, 0); o1c2 += 2; wasm_v128_store64_lane(o1c3, vo1c23, 1); o1c3 += 2; wasm_v128_store64_lane(o0c0, vo0c01, 0); o0c0 += 2; wasm_v128_store64_lane(o0c1, vo0c01, 1); o0c1 += 2; wasm_v128_store64_lane(o0c2, vo0c23, 0); o0c2 += 2; wasm_v128_store64_lane(o0c3, vo0c23, 1); o0c3 += 2; } assert(iw < 4); if XNN_UNLIKELY(iw != 0) { v128_t vo0x0 = wasm_v128_load(w); v128_t vo1x0 = vo0x0; v128_t vo0x1 = vo0x0; v128_t vo1x1 = vo0x0; const v128_t vk00c0 = wasm_v128_load(w + 4); // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 
) v128_t vi0x1 = wasm_v128_load(i0); v128_t vi1x1 = wasm_v128_load(i1); v128_t vi2x1 = wasm_v128_load(i2); v128_t vi3x1 = wasm_v128_load(i3); v128_t vi4x1 = wasm_v128_load(i4); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi0x0, vi0x0, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))); } const v128_t vk10c0 = wasm_v128_load(w + 8); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi1x0, vi1x0, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi3x0, vi3x0, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 3, 3, 3, 3))); } const v128_t vk20c0 = wasm_v128_load(w + 12); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi4x0, vi4x0, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 3, 3, 3, 3))); } const v128_t vk00c1 = wasm_v128_load(w + 16); v128_t vi0x2 = wasm_f32x4_const_splat(0.0f); v128_t vi1x2 = wasm_f32x4_const_splat(0.0f); v128_t vi2x2 = wasm_f32x4_const_splat(0.0f); v128_t vi3x2 = wasm_f32x4_const_splat(0.0f); v128_t vi4x2 = wasm_f32x4_const_splat(0.0f); if (iw >= 2) { // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 ) vi0x2 = wasm_v128_load(i0 + 4); vi1x2 = wasm_v128_load(i1 + 4); vi2x2 = wasm_v128_load(i2 + 4); vi3x2 = wasm_v128_load(i3 + 4); vi4x2 = wasm_v128_load(i4 + 4); } vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi0x0, vi0x0, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))); const v128_t vk10c1 = wasm_v128_load(w + 20); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x0, vi1x0, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x0, vi3x0, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0))); const v128_t vk20c1 = wasm_v128_load(w + 24); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi4x0, vi4x0, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 0, 0, 0, 0))); const v128_t vk00c2 = wasm_v128_load(w + 28); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi0x0, vi0x0, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 
3, 3, 3, 3))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi0x2, vi0x2, 1, 1, 1, 1))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); const v128_t vk10c2 = wasm_v128_load(w + 32); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi1x0, vi1x0, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi3x0, vi3x0, 3, 3, 3, 3))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi1x2, vi1x2, 1, 1, 1, 1))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi3x2, vi3x2, 1, 1, 1, 1))); const v128_t vk20c2 = wasm_v128_load(w + 36); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi4x0, vi4x0, 3, 3, 3, 3))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi4x2, vi4x2, 1, 1, 1, 1))); const v128_t vk01c0 = wasm_v128_load(w + 40); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi0x2, vi0x2, 2, 2, 2, 2))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi2x2, vi2x2, 2, 2, 2, 2))); } const v128_t vk11c0 = wasm_v128_load(w + 44); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 0, 0, 0, 0))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi1x2, vi1x2, 2, 2, 2, 2))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c0, wasm_v32x4_shuffle(vi3x2, vi3x2, 2, 2, 2, 2))); } const v128_t vk21c0 = wasm_v128_load(w + 48); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 0, 0, 0, 0))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi2x2, vi2x2, 2, 2, 2, 2))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi4x2, vi4x2, 2, 2, 2, 2))); } const v128_t vk01c1 = wasm_v128_load(w + 52); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi0x1, vi0x1, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3))); } const v128_t vk11c1 = wasm_v128_load(w + 56); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi1x1, vi1x1, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi3x1, vi3x1, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 3, 3, 3, 3))); } const v128_t vk21c1 = wasm_v128_load(w + 60); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c1, 
wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x1, vi4x1, 1, 1, 1, 1))); if (iw > 2) { vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 3, 3, 3, 3))); } const v128_t vk01c2 = wasm_v128_load(w + 64); v128_t vi0x3 = wasm_f32x4_const_splat(0.0f); v128_t vi1x3 = wasm_f32x4_const_splat(0.0f); v128_t vi2x3 = wasm_f32x4_const_splat(0.0f); v128_t vi3x3 = wasm_f32x4_const_splat(0.0f); v128_t vi4x3 = wasm_f32x4_const_splat(0.0f); if (iw > 2) { // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 ) vi0x3 = wasm_v128_load32_splat(i0 + 8); vi1x3 = wasm_v128_load32_splat(i1 + 8); vi2x3 = wasm_v128_load32_splat(i2 + 8); vi3x3 = wasm_v128_load32_splat(i3 + 8); vi4x3 = wasm_v128_load32_splat(i4 + 8); } vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi0x1, vi0x1, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi0x3, vi0x3, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 0, 0, 0, 0))); const v128_t vk11c2 = wasm_v128_load(w + 68); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi1x1, vi1x1, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi3x1, vi3x1, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi1x3, vi1x3, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk11c2, wasm_v32x4_shuffle(vi3x3, vi3x3, 0, 0, 0, 0))); const v128_t vk21c2 = wasm_v128_load(w + 72); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x1, vi4x1, 2, 2, 2, 2))); vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi2x3, vi2x3, 0, 0, 0, 0))); vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 0, 0, 0, 0))); if (iw >= 2) { const v128_t vk02c0 = wasm_v128_load(w + 76); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))); const v128_t vk12c0 = wasm_v128_load(w + 80); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c0, wasm_v32x4_shuffle(vi3x1, vi3x1, 3, 3, 3, 3))); const v128_t vk22c0 = wasm_v128_load(w + 84); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi4x1, vi4x1, 3, 3, 3, 3))); const v128_t vk02c1 = wasm_v128_load(w + 88); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))); const v128_t vk12c1 = wasm_v128_load(w + 92); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0))); const v128_t vk22c1 = wasm_v128_load(w + 96); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c1, 
wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 0, 0, 0, 0))); const v128_t vk02c2 = wasm_v128_load(w + 100); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi0x2, vi0x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); const v128_t vk12c2 = wasm_v128_load(w + 104); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi1x2, vi1x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk12c2, wasm_v32x4_shuffle(vi3x2, vi3x2, 1, 1, 1, 1))); const v128_t vk22c2 = wasm_v128_load(w + 108); vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))); vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi4x2, vi4x2, 1, 1, 1, 1))); } vo0x0 = wasm_f32x4_pmax(vmin, vo0x0); vo1x0 = wasm_f32x4_pmax(vmin, vo1x0); vo0x1 = wasm_f32x4_pmax(vmin, vo0x1); vo1x1 = wasm_f32x4_pmax(vmin, vo1x1); vo0x0 = wasm_f32x4_pmin(vmax, vo0x0); vo1x0 = wasm_f32x4_pmin(vmax, vo1x0); vo0x1 = wasm_f32x4_pmin(vmax, vo0x1); vo1x1 = wasm_f32x4_pmin(vmax, vo1x1); if (iw == 3) { // Exactly 2 output width elements remaining const v128_t vo0c01 = wasm_v32x4_shuffle(vo0x0, vo0x1, 0, 4, 1, 5); const v128_t vo0c23 = wasm_v32x4_shuffle(vo0x0, vo0x1, 2, 6, 3, 7); const v128_t vo1c01 = wasm_v32x4_shuffle(vo1x0, vo1x1, 0, 4, 1, 5); const v128_t vo1c23 = wasm_v32x4_shuffle(vo1x0, vo1x1, 2, 6, 3, 7); wasm_v128_store64_lane(o1c0, vo1c01, 0); o1c0 += 2; wasm_v128_store64_lane(o1c1, vo1c01, 1); o1c1 += 2; wasm_v128_store64_lane(o1c2, vo1c23, 0); o1c2 += 2; wasm_v128_store64_lane(o1c3, vo1c23, 1); o1c3 += 2; wasm_v128_store64_lane(o0c0, vo0c01, 0); o0c0 += 2; wasm_v128_store64_lane(o0c1, vo0c01, 1); o0c1 += 2; wasm_v128_store64_lane(o0c2, vo0c23, 0); o0c2 += 2; wasm_v128_store64_lane(o0c3, vo0c23, 1); o0c3 += 2; } else { // Exactly 1 output width element remaining wasm_v128_store32_lane(o1c0, vo1x0, 0); o1c0 += 1; wasm_v128_store32_lane(o1c1, vo1x0, 1); o1c1 += 1; wasm_v128_store32_lane(o1c2, vo1x0, 2); o1c2 += 1; wasm_v128_store32_lane(o1c3, vo1x0, 3); o1c3 += 1; wasm_v128_store32_lane(o0c0, vo0x0, 0); o0c0 += 1; wasm_v128_store32_lane(o0c1, vo0x0, 1); o0c1 += 1; wasm_v128_store32_lane(o0c2, vo0x0, 2); o0c2 += 1; wasm_v128_store32_lane(o0c3, vo0x0, 3); o0c3 += 1; } } // Move output pointers back to the position of the first pixel in a row, // and forward to the next block of output channels. 
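      /*
       * A recap of the compute scheme used in the loops above (a sketch;
       * the names vk, vi, vacc and the lane index l are illustrative):
       * every v128 of weights holds one 3x3 tap for 4 output channels, and
       * each packed-HWC input scalar is broadcast with a self-shuffle
       * before the multiply-accumulate:
       *
       *   vacc = wasm_f32x4_add(vacc,
       *     wasm_f32x4_mul(vk, wasm_v32x4_shuffle(vi, vi, l, l, l, l)));
       *
       * Four accumulators cover 2 output rows x 2 output columns; the
       * wasm_v32x4_shuffle(voMx0, voMx1, 0, 4, 1, 5) / (2, 6, 3, 7) pairs
       * transpose them so each wasm_v128_store64_lane writes two adjacent
       * output-width elements of one channel in CHW layout.
       */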
o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment); o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment); o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment); o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment); o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment); o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment); o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment); o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment); // Revert input pointers to the position of the first pixel in a row i0 = (const float*) ((uintptr_t) i0 - input_width_increment); i1 = (const float*) ((uintptr_t) i1 - input_width_increment); i2 = (const float*) ((uintptr_t) i2 - input_width_increment); i3 = (const float*) ((uintptr_t) i3 - input_width_increment); i4 = (const float*) ((uintptr_t) i4 - input_width_increment); // Move to the block of weights for the next 4 output channels w += 112; c = doz(c, 4); } while (c != 0); // Move output pointers forward to the next two rows output0 = (float*) ((uintptr_t) output1 + output_height_stride); output1 = (float*) ((uintptr_t) output0 + output_height_stride); // Move input pointers forward to the next four rows i0 = i4; i1 = (const float*) ((uintptr_t) i0 + input_height_stride); i2 = (const float*) ((uintptr_t) i1 + input_height_stride); i3 = (const float*) ((uintptr_t) i2 + input_height_stride); i4 = (const float*) ((uintptr_t) i3 + input_height_stride); } } void xnn_f32_dwconv_minmax_ukernel_25p4c__wasmsimd_arm( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const float*) ((uintptr_t) i9 + input_offset); } const float* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const float*) ((uintptr_t) i10 + input_offset); } const float* i11 = input[11]; 
assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const float*) ((uintptr_t) i11 + input_offset); } const float* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const float*) ((uintptr_t) i12 + input_offset); } const float* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const float*) ((uintptr_t) i13 + input_offset); } const float* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const float*) ((uintptr_t) i14 + input_offset); } const float* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const float*) ((uintptr_t) i15 + input_offset); } const float* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const float*) ((uintptr_t) i16 + input_offset); } const float* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const float*) ((uintptr_t) i17 + input_offset); } const float* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const float*) ((uintptr_t) i18 + input_offset); } const float* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const float*) ((uintptr_t) i19 + input_offset); } const float* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const float*) ((uintptr_t) i20 + input_offset); } const float* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const float*) ((uintptr_t) i21 + input_offset); } const float* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const float*) ((uintptr_t) i22 + input_offset); } const float* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const float*) ((uintptr_t) i23 + input_offset); } const float* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const float*) ((uintptr_t) i24 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, 
vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); const v128_t vi9x0123 = wasm_v128_load(i9); i9 += 4; const v128_t vk9x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi9x0123, vk9x0123), vacc0123p0); const v128_t vi10x0123 = wasm_v128_load(i10); i10 += 4; const v128_t vk10x0123 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi10x0123, vk10x0123), vacc0123p0); const v128_t vi11x0123 = wasm_v128_load(i11); i11 += 4; const v128_t vk11x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi11x0123, vk11x0123), vacc0123p0); const v128_t vi12x0123 = wasm_v128_load(i12); i12 += 4; const v128_t vk12x0123 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi12x0123, vk12x0123), vacc0123p0); const v128_t vi13x0123 = wasm_v128_load(i13); i13 += 4; const v128_t vk13x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi13x0123, vk13x0123), vacc0123p0); const v128_t vi14x0123 = wasm_v128_load(i14); i14 += 4; const v128_t vk14x0123 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi14x0123, vk14x0123), vacc0123p0); const v128_t vi15x0123 = wasm_v128_load(i15); i15 += 4; const v128_t vk15x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi15x0123, vk15x0123), vacc0123p0); const v128_t vi16x0123 = wasm_v128_load(i16); i16 += 4; const v128_t vk16x0123 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi16x0123, vk16x0123), vacc0123p0); const v128_t vi17x0123 = wasm_v128_load(i17); i17 += 4; const v128_t vk17x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi17x0123, vk17x0123), vacc0123p0); const v128_t vi18x0123 = wasm_v128_load(i18); i18 += 4; const v128_t vk18x0123 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi18x0123, vk18x0123), vacc0123p0); const v128_t vi19x0123 = wasm_v128_load(i19); i19 += 4; const v128_t vk19x0123 = wasm_v128_load(w + 80); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi19x0123, vk19x0123), vacc0123p0); const v128_t vi20x0123 = wasm_v128_load(i20); i20 += 4; const v128_t vk20x0123 = wasm_v128_load(w + 84); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi20x0123, vk20x0123), vacc0123p0); const v128_t vi21x0123 = wasm_v128_load(i21); i21 += 4; const v128_t vk21x0123 = wasm_v128_load(w + 88); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi21x0123, vk21x0123), vacc0123p0); const v128_t vi22x0123 = wasm_v128_load(i22); i22 += 4; const v128_t vk22x0123 = wasm_v128_load(w + 92); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi22x0123, vk22x0123), vacc0123p0); const v128_t vi23x0123 = wasm_v128_load(i23); i23 += 4; const v128_t vk23x0123 = wasm_v128_load(w + 96); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi23x0123, vk23x0123), vacc0123p0); const v128_t vi24x0123 = wasm_v128_load(i24); i24 += 4; const v128_t vk24x0123 = wasm_v128_load(w + 100); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi24x0123, vk24x0123), vacc0123p0); w += 104; v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = 
wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); const v128_t vi9x0123 = wasm_v128_load(i9); const v128_t vk9x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi9x0123, vk9x0123), vacc0123p0); const v128_t vi10x0123 = wasm_v128_load(i10); const v128_t vk10x0123 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi10x0123, vk10x0123), vacc0123p0); const v128_t vi11x0123 = wasm_v128_load(i11); const v128_t vk11x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi11x0123, vk11x0123), vacc0123p0); const v128_t vi12x0123 = wasm_v128_load(i12); const v128_t vk12x0123 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi12x0123, vk12x0123), vacc0123p0); const v128_t vi13x0123 = wasm_v128_load(i13); const v128_t vk13x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi13x0123, vk13x0123), vacc0123p0); const v128_t vi14x0123 = wasm_v128_load(i14); const v128_t vk14x0123 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi14x0123, vk14x0123), vacc0123p0); const v128_t vi15x0123 = wasm_v128_load(i15); const v128_t vk15x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi15x0123, vk15x0123), vacc0123p0); const v128_t vi16x0123 = wasm_v128_load(i16); const v128_t vk16x0123 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi16x0123, vk16x0123), vacc0123p0); const v128_t vi17x0123 = wasm_v128_load(i17); const v128_t vk17x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi17x0123, vk17x0123), vacc0123p0); const v128_t vi18x0123 = wasm_v128_load(i18); const v128_t vk18x0123 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi18x0123, vk18x0123), vacc0123p0); const v128_t vi19x0123 = wasm_v128_load(i19); const v128_t vk19x0123 = wasm_v128_load(w + 80); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi19x0123, vk19x0123), vacc0123p0); const v128_t vi20x0123 = wasm_v128_load(i20); const v128_t vk20x0123 = wasm_v128_load(w + 84); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi20x0123, vk20x0123), vacc0123p0); const v128_t vi21x0123 = wasm_v128_load(i21); const v128_t vk21x0123 = wasm_v128_load(w + 88); vacc0123p0 = 
wasm_f32x4_add(wasm_f32x4_mul(vi21x0123, vk21x0123), vacc0123p0); const v128_t vi22x0123 = wasm_v128_load(i22); const v128_t vk22x0123 = wasm_v128_load(w + 92); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi22x0123, vk22x0123), vacc0123p0); const v128_t vi23x0123 = wasm_v128_load(i23); const v128_t vk23x0123 = wasm_v128_load(w + 96); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi23x0123, vk23x0123), vacc0123p0); const v128_t vi24x0123 = wasm_v128_load(i24); const v128_t vk24x0123 = wasm_v128_load(w + 100); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi24x0123, vk24x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_25p4c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const float* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const float*) ((uintptr_t) i9 + input_offset); } const float* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const float*) ((uintptr_t) i10 + input_offset); } const float* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const float*) ((uintptr_t) i11 + input_offset); } const float* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const float*) ((uintptr_t) i12 + input_offset); } const float* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const float*) ((uintptr_t) i13 + input_offset); } const float* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const float*) ((uintptr_t) i14 + input_offset); } const float* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const 
float*) ((uintptr_t) i15 + input_offset); } const float* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const float*) ((uintptr_t) i16 + input_offset); } const float* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const float*) ((uintptr_t) i17 + input_offset); } const float* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const float*) ((uintptr_t) i18 + input_offset); } const float* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const float*) ((uintptr_t) i19 + input_offset); } const float* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const float*) ((uintptr_t) i20 + input_offset); } const float* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const float*) ((uintptr_t) i21 + input_offset); } const float* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const float*) ((uintptr_t) i22 + input_offset); } const float* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const float*) ((uintptr_t) i23 + input_offset); } const float* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const float*) ((uintptr_t) i24 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); const v128_t vi9x0123 = wasm_v128_load(i9); i9 += 4; const v128_t vk9x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi9x0123, vk9x0123), vacc0123p0); const v128_t vi10x0123 = wasm_v128_load(i10); i10 += 4; const v128_t vk10x0123 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi10x0123, vk10x0123), vacc0123p0); const v128_t vi11x0123 = wasm_v128_load(i11); i11 += 4; const v128_t vk11x0123 = 
wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi11x0123, vk11x0123), vacc0123p0); const v128_t vi12x0123 = wasm_v128_load(i12); i12 += 4; const v128_t vk12x0123 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi12x0123, vk12x0123), vacc0123p0); const v128_t vi13x0123 = wasm_v128_load(i13); i13 += 4; const v128_t vk13x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi13x0123, vk13x0123), vacc0123p0); const v128_t vi14x0123 = wasm_v128_load(i14); i14 += 4; const v128_t vk14x0123 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi14x0123, vk14x0123), vacc0123p0); const v128_t vi15x0123 = wasm_v128_load(i15); i15 += 4; const v128_t vk15x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi15x0123, vk15x0123), vacc0123p0); const v128_t vi16x0123 = wasm_v128_load(i16); i16 += 4; const v128_t vk16x0123 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi16x0123, vk16x0123), vacc0123p0); const v128_t vi17x0123 = wasm_v128_load(i17); i17 += 4; const v128_t vk17x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi17x0123, vk17x0123), vacc0123p0); const v128_t vi18x0123 = wasm_v128_load(i18); i18 += 4; const v128_t vk18x0123 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi18x0123, vk18x0123), vacc0123p0); const v128_t vi19x0123 = wasm_v128_load(i19); i19 += 4; const v128_t vk19x0123 = wasm_v128_load(w + 80); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi19x0123, vk19x0123), vacc0123p0); const v128_t vi20x0123 = wasm_v128_load(i20); i20 += 4; const v128_t vk20x0123 = wasm_v128_load(w + 84); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi20x0123, vk20x0123), vacc0123p0); const v128_t vi21x0123 = wasm_v128_load(i21); i21 += 4; const v128_t vk21x0123 = wasm_v128_load(w + 88); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi21x0123, vk21x0123), vacc0123p0); const v128_t vi22x0123 = wasm_v128_load(i22); i22 += 4; const v128_t vk22x0123 = wasm_v128_load(w + 92); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi22x0123, vk22x0123), vacc0123p0); const v128_t vi23x0123 = wasm_v128_load(i23); i23 += 4; const v128_t vk23x0123 = wasm_v128_load(w + 96); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi23x0123, vk23x0123), vacc0123p0); const v128_t vi24x0123 = wasm_v128_load(i24); i24 += 4; const v128_t vk24x0123 = wasm_v128_load(w + 100); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi24x0123, vk24x0123), vacc0123p0); w += 104; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const 
v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); const v128_t vi9x0123 = wasm_v128_load(i9); const v128_t vk9x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi9x0123, vk9x0123), vacc0123p0); const v128_t vi10x0123 = wasm_v128_load(i10); const v128_t vk10x0123 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi10x0123, vk10x0123), vacc0123p0); const v128_t vi11x0123 = wasm_v128_load(i11); const v128_t vk11x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi11x0123, vk11x0123), vacc0123p0); const v128_t vi12x0123 = wasm_v128_load(i12); const v128_t vk12x0123 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi12x0123, vk12x0123), vacc0123p0); const v128_t vi13x0123 = wasm_v128_load(i13); const v128_t vk13x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi13x0123, vk13x0123), vacc0123p0); const v128_t vi14x0123 = wasm_v128_load(i14); const v128_t vk14x0123 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi14x0123, vk14x0123), vacc0123p0); const v128_t vi15x0123 = wasm_v128_load(i15); const v128_t vk15x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi15x0123, vk15x0123), vacc0123p0); const v128_t vi16x0123 = wasm_v128_load(i16); const v128_t vk16x0123 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi16x0123, vk16x0123), vacc0123p0); const v128_t vi17x0123 = wasm_v128_load(i17); const v128_t vk17x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi17x0123, vk17x0123), vacc0123p0); const v128_t vi18x0123 = wasm_v128_load(i18); const v128_t vk18x0123 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi18x0123, vk18x0123), vacc0123p0); const v128_t vi19x0123 = wasm_v128_load(i19); const v128_t vk19x0123 = wasm_v128_load(w + 80); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi19x0123, vk19x0123), vacc0123p0); const v128_t vi20x0123 = wasm_v128_load(i20); const v128_t vk20x0123 = wasm_v128_load(w + 84); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi20x0123, vk20x0123), vacc0123p0); const v128_t vi21x0123 = wasm_v128_load(i21); const v128_t vk21x0123 = wasm_v128_load(w + 88); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi21x0123, vk21x0123), vacc0123p0); const v128_t vi22x0123 = wasm_v128_load(i22); const v128_t vk22x0123 = wasm_v128_load(w + 92); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi22x0123, vk22x0123), vacc0123p0); const v128_t vi23x0123 = wasm_v128_load(i23); const v128_t vk23x0123 = wasm_v128_load(w + 96); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi23x0123, vk23x0123), vacc0123p0); const v128_t vi24x0123 = wasm_v128_load(i24); const v128_t vk24x0123 = wasm_v128_load(w + 100); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi24x0123, vk24x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); 
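        // The partial-store idiom used here and in the kernels above:
        // store the low two lanes, then the wasm_v64x2_shuffle just
        // performed moves lanes 2,3 down to lanes 0,1 so a following
        // `c & 1` store can always read lane 0. A minimal sketch
        // (`out` is an illustrative name):
        //   wasm_v128_store64_lane(out, vacc, 0);        // channels c+0, c+1
        //   vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); // lanes 2,3 -> 0,1
        //   wasm_v128_store32_lane(out + 2, vacc, 0);    // channel c+2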
        output += 2;
      }
      if (c & 1) {
        wasm_v128_store32_lane(output, vacc0123, 0);
        output += 1;
      }
    }
    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}

void xnn_f32_dwconv_minmax_ukernel_3f3m3l4c4s4r__wasmsimd_arm(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    size_t kernel_size,
    float* buffer,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);
  assert(kernel_size > 3);

  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);

  do {
    const float* w = weights;

    // First pass to process 3 inputs.
    {
      float* b = buffer;
      const float* i0 = input[0];
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = input[1];
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = input[2];
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      input += 3;

      // Process c channels and write to buffer.
      size_t c = 0;
      for (; c < channels; c += 4) {
        v128_t vacc0p0 = wasm_v128_load(w);

        const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4;
        const v128_t vk0x0123 = wasm_v128_load(w + 4);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0);

        const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4;
        const v128_t vk1x0123 = wasm_v128_load(w + 8);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0);

        const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4;
        const v128_t vk2x0123 = wasm_v128_load(w + 12);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0);

        w += 16;

        wasm_v128_store(b, vacc0p0);
        b += 4;
      }
    }

    // Middle pass to process 3 inputs in each iteration.
    for (size_t ks = kernel_size - 3; ks > 3; ks -= 3) {
      float* b = buffer;
      const float* i0 = input[0];
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const float*) ((uintptr_t) i0 + input_offset);
      }
      const float* i1 = input[1];
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const float*) ((uintptr_t) i1 + input_offset);
      }
      const float* i2 = input[2];
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const float*) ((uintptr_t) i2 + input_offset);
      }
      input += 3;

      size_t c = 0;
      for (; c < channels; c += 4) {
        v128_t vacc0p0 = wasm_v128_load(b);

        const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4;
        const v128_t vk0x0123 = wasm_v128_load(w);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0);

        const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4;
        const v128_t vk1x0123 = wasm_v128_load(w + 4);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0);

        const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4;
        const v128_t vk2x0123 = wasm_v128_load(w + 8);
        vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0);

        w += 12;

        wasm_v128_store(b, vacc0p0);
        b += 4;
      }
    }

    // Last pass to process up to 3 inputs.
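    /*
     * Shape of the 3f3m3l multipass scheme (inferred from the `w` offsets
     * above; a sketch, not a normative layout): the first pass seeds each
     * accumulator from the per-channel bias and folds in taps 0..2, reading
     * 16 weight floats per 4-channel group and parking partial sums in
     * `buffer`; each middle pass reloads `buffer`, folds in 3 more taps
     * (12 floats per group), and writes it back; the last pass below folds
     * in the final taps and applies the [vmin, vmax] clamp before storing:
     *
     *   first pass:  [ bias[0..3] | k0[0..3] | k1[0..3] | k2[0..3] ]  w += 16
     *   middle pass: [ k3[0..3]   | k4[0..3] | k5[0..3] ]             w += 12
     *   last pass:   [ next 3 taps ]                                  w += 12
     */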
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; v128_t vacc0 = wasm_f32x4_max(vacc0p0, vmin); vacc0 = wasm_f32x4_min(vacc0, vmax); wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); v128_t vacc0 = wasm_f32x4_max(vacc0p0, vmin); vacc0 = wasm_f32x4_min(vacc0, vmax); if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_3f3m3l4c4s4r__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 3); do { const float* w = weights; // First pass to process 3 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; // Process c channels and write to buffer. 
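      /*
       * The channel loop below advances in steps of 4 while c < channels,
       * so channel counts that are not multiples of 4 are rounded up and
       * the final iteration loads past the nominal end of each row; that
       * is what the XNN_OOB_READS annotation on this kernel permits. Only
       * the last pass narrows its stores to the exact remainder (the
       * c & 2 / c & 1 tail), and the caller is expected to size `buffer`
       * for the rounded-up channel count.
       */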
size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 16; wasm_v128_store(b, vacc0p0); b += 4; } } // Middle pass to process 3 inputs in each iteration. for (size_t ks = kernel_size - 3; ks > 3; ks -= 3) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; wasm_v128_store(b, vacc0p0); b += 4; } } // Last pass to process up to 3 inputs. 
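// This variant applies no activation, so the last pass stores the raw
// accumulator; the `const v128_t vacc0 = vacc0p0;` copies below simply keep
// the store sequence in the same shape as the minmax variants.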
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; const v128_t vacc0 = vacc0p0; wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); v128_t vacc0 = vacc0p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3f3m3l8c4s4r__wasmsimd_x86( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 3); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* w = weights; // First pass to process 3 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; // Process c channels and write to buffer. 
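// The 8c4s4r tiling rounds the channel count up to a multiple of 4, runs
// the main loop 8 channels at a time, and finishes any 4-channel remainder
// group; the packed weights and `buffer` are padded so the rounded-up
// iteration stays in bounds. A minimal sketch of round_up_po2, assuming the
// usual power-of-two rounding:
//
//   static inline size_t round_up_po2(size_t n, size_t q) {
//     return (n + q - 1) & ~(q - 1);  // q must be a power of 2
//   }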
size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 32; wasm_v128_store(b, vacc0123p0); wasm_v128_store(b + 4, vacc4567p0); b += 8; } if (c != 0) { v128_t vacc0p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 16; wasm_v128_store(b, vacc0p0); b += 4; } } // Middle pass to process 3 inputs in each iteration. 
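// Middle passes run while more than 3 taps remain (ks > 3), so the last
// pass is always left with 1 to 3 taps. Worked example for kernel_size = 9:
// the first pass covers taps 0-2, one middle pass covers taps 3-5, and the
// last pass covers taps 6-8.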
for (size_t ks = kernel_size - 3; ks > 3; ks -= 3) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(b); v128_t vacc4567p0 = wasm_v128_load(b + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w); const v128_t vk0x4567 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 8); const v128_t vk1x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 16); const v128_t vk2x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 24; wasm_v128_store(b, vacc0123p0); wasm_v128_store(b + 4, vacc4567p0); b += 8; } if (c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; wasm_v128_store(b, vacc0p0); b += 4; } } // Last pass to process up to 3 inputs. 
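// The x86-tuned variants clamp with wasm_f32x4_pmax/wasm_f32x4_pmin, which
// lower to single SSE maxps/minps instructions:
//
//   pmin(a, b) = (b < a) ? b : a;    pmax(a, b) = (a < b) ? b : a;
//
// The arm-tuned variants above use the IEEE-style wasm_f32x4_max/min
// instead; for the finite bounds loaded from params both give the same
// clamped result.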
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } size_t c = channels; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(b); v128_t vacc4567p0 = wasm_v128_load(b + 4); b += 8; const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; v128_t vk0x0123 = wasm_v128_load(w); v128_t vk0x4567 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; v128_t vk1x0123 = wasm_v128_load(w + 8); v128_t vk1x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; v128_t vk2x0123 = wasm_v128_load(w + 16); v128_t vk2x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 24; v128_t vacc0123 = wasm_f32x4_pmax(vacc0123p0, vmin); v128_t vacc4567 = wasm_f32x4_pmax(vacc4567p0, vmin); vacc0123 = wasm_f32x4_pmin(vacc0123, vmax); vacc4567 = wasm_f32x4_pmin(vacc4567, vmax); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; v128_t vacc0 = wasm_f32x4_pmax(vacc0p0, vmin); vacc0 = wasm_f32x4_pmin(vacc0, vmax); wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); v128_t vacc0 = wasm_f32x4_pmax(vacc0p0, vmin); vacc0 = wasm_f32x4_pmin(vacc0, vmax); if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void 
xnn_f32_dwconv_ukernel_3f3m3l8c4s4r__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 3); do { const float* w = weights; // First pass to process 3 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; // Process c channels and write to buffer. size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 32; wasm_v128_store(b, vacc0123p0); wasm_v128_store(b + 4, vacc4567p0); b += 8; } if (c != 0) { v128_t vacc0p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 16; wasm_v128_store(b, vacc0p0); b += 4; } } // Middle pass to process 3 inputs in each iteration. 
for (size_t ks = kernel_size - 3; ks > 3; ks -= 3) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input += 3; size_t c = round_up_po2(channels, 4); for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(b); v128_t vacc4567p0 = wasm_v128_load(b + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w); const v128_t vk0x4567 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 8); const v128_t vk1x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 16); const v128_t vk2x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 24; wasm_v128_store(b, vacc0123p0); wasm_v128_store(b + 4, vacc4567p0); b += 8; } if (c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; wasm_v128_store(b, vacc0p0); b += 4; } } // Last pass to process up to 3 inputs. 
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } size_t c = channels; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(b); v128_t vacc4567p0 = wasm_v128_load(b + 4); b += 8; const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; v128_t vk0x0123 = wasm_v128_load(w); v128_t vk0x4567 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; v128_t vk1x0123 = wasm_v128_load(w + 8); v128_t vk1x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; v128_t vk2x0123 = wasm_v128_load(w + 16); v128_t vk2x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 24; const v128_t vacc0123 = vacc0123p0; const v128_t vacc4567 = vacc4567p0; wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); w += 12; const v128_t vacc0 = vacc0p0; wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); v128_t vacc0 = vacc0p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3p4c__wasmsimd_arm( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union 
xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); w += 16; v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_3p4c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); 
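// Baseline WebAssembly SIMD has no fused multiply-add, so each tap is
// accumulated as an explicit multiply followed by an add,
// wasm_f32x4_add(wasm_f32x4_mul(vi, vk), vacc); kernels built on the
// relaxed-simd extension (wasm_f32x4_relaxed_madd in newer toolchains) are
// generated as a separate family.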
vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); w += 16; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_3p8c__wasmsimd_x86( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = 
wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 32; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); v128_t vacc4567 = wasm_f32x4_pmax(vmin, vacc4567p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); vacc4567 = wasm_f32x4_pmin(vmax, vacc4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); w += 4; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_3p8c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = 
wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); w += 32; const v128_t vacc0123 = vacc0123p0; const v128_t vacc4567 = vacc4567p0; wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); w += 4; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_4p4c__wasmsimd_arm( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = 
wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); w += 20; v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_4p4c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = 
wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); w += 20; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_4p8c__wasmsimd_x86( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = 
wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vi3x4567 = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vk3x0123 = wasm_v128_load(w + 32); const v128_t vk3x4567 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vk3x4567), vacc4567p0); w += 40; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); v128_t vacc4567 = wasm_f32x4_pmax(vmin, vacc4567p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); vacc4567 = wasm_f32x4_pmin(vmax, vacc4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); w += 4; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_4p8c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) 
((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vi3x4567 = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vk3x0123 = wasm_v128_load(w + 32); const v128_t vk3x4567 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vk3x4567), vacc4567p0); w += 40; const v128_t vacc0123 = vacc0123p0; const v128_t vacc4567 = vacc4567p0; wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); w += 4; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t 
vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_5f5m5l4c4s4r__wasmsimd_arm( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 5); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* w = weights; // First pass to process 5 inputs. { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; // Process c channels and write to buffer. size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 24; wasm_v128_store(b, vacc0p0); b += 4; } } // Middle pass to process 5 inputs in each iteration. 
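// The 5f5m5l scheme mirrors 3f3m3l with 5 taps per pass: first-pass groups
// hold { bias[4], k0..k4[4] } = 24 floats, middle/last-pass groups hold 20.
// Worked example for kernel_size = 12: the first pass covers taps 0-4, one
// middle pass covers taps 5-9, and the last pass covers taps 10-11.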
for (size_t ks = kernel_size - 5; ks > 5; ks -= 5) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 20; wasm_v128_store(b, vacc0p0); b += 4; } } // Last pass to process up to 5 inputs. 
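// In the c != 0 tail of the last pass, full 4-lane vectors are still loaded
// from b, i0-i4 and w even when only 1-3 channels remain; the XNN_OOB_READS
// annotation records that reading (never writing) past the ends of these
// buffers is allowed. Stores stay exact via lane-wise writes (sketch):
//
//   if (c & 2) { store lanes 0-1; shift lanes 2-3 down; output += 2; }
//   if (c & 1) { store lane 0; output += 1; }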
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 20; v128_t vacc0 = wasm_f32x4_max(vacc0p0, vmin); vacc0 = wasm_f32x4_min(vacc0, vmax); wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); v128_t vacc0 = wasm_f32x4_max(vacc0p0, vmin); vacc0 = wasm_f32x4_min(vacc0, vmax); if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_5f5m5l4c4s4r__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, size_t kernel_size, float* buffer, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); assert(kernel_size > 5); do { const float* w = weights; // First pass to process 5 inputs. 
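// Each pass re-derives its row pointers from input[]: rows lying in the
// implicit zero padding are passed as the shared `zero` buffer and must not
// be offset, which is what the `if XNN_UNPREDICTABLE(iN != zero)` guards
// select. Caller-side setup looks roughly like this (hypothetical helper
// names, illustrative only):
//
//   rows[k] = row_is_real(k) ? &src[row_start(k)] : zero_buffer;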
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; // Process c channels and write to buffer. size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 24; wasm_v128_store(b, vacc0p0); b += 4; } } // Middle pass to process 5 inputs in each iteration. 
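// Note: middle passes re-load the partial sums the first pass wrote to
// `buffer`, so no bias is read and `w` advances by only 5*4 = 20 floats per
// tile. After the first pass, kernel_size - 5 taps remain; each middle
// iteration consumes 5 of them until at most 5 are left for the last pass.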
for (size_t ks = kernel_size - 5; ks > 5; ks -= 5) { float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } input += 5; size_t c = 0; for (; c < channels; c += 4) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 20; wasm_v128_store(b, vacc0p0); b += 4; } } // Last pass to process up to 5 inputs. 
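// Note (editorial sketch, not part of the generated kernel): in the last
// pass below, a channel remainder of 1..3 is written with partial-lane
// stores. In scalar terms:
//
//   if (c & 2) { output[0] = vacc[0]; output[1] = vacc[1];
//                vacc[0] = vacc[2]; vacc[1] = vacc[3]; output += 2; }
//   if (c & 1) { output[0] = vacc[0]; output += 1; }
//
// The tail loads may read up to 3 floats past the remainder, which is
// permitted under the XNN_OOB_READS annotation on this kernel.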
{ float* b = buffer; const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } size_t c = channels; for (; c >= 4; c -= 4) { v128_t vacc0p0 = wasm_v128_load(b); b += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); w += 20; const v128_t vacc0 = vacc0p0; wasm_v128_store(output, vacc0); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0p0 = wasm_v128_load(b); const v128_t vi0x0123 = wasm_v128_load(i0); v128_t vk0x0123 = wasm_v128_load(w); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0p0); const v128_t vi1x0123 = wasm_v128_load(i1); v128_t vk1x0123 = wasm_v128_load(w + 4); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0p0); const v128_t vi2x0123 = wasm_v128_load(i2); v128_t vk2x0123 = wasm_v128_load(w + 8); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0p0); const v128_t vi3x0123 = wasm_v128_load(i3); v128_t vk3x0123 = wasm_v128_load(w + 12); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0p0); const v128_t vi4x0123 = wasm_v128_load(i4); v128_t vk4x0123 = wasm_v128_load(w + 16); vacc0p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0p0); v128_t vacc0 = vacc0p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0, 0); output += 1; } } } input = (const float**) ((uintptr_t) input + input_stride); output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_9p4c__wasmsimd_arm( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + 
input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); w += 40; v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = 
wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_max(vmin, vacc0123p0); vacc0123 = wasm_f32x4_min(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_9p4c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = 
wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); w += 40; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 4); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vk5x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_minmax_ukernel_9p8c__wasmsimd_x86( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, 
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vi3x4567 = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vk3x0123 = wasm_v128_load(w + 32); const v128_t vk3x4567 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vk3x4567), vacc4567p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vi4x4567 = wasm_v128_load(i4 + 4); i4 += 8; const v128_t vk4x0123 = wasm_v128_load(w + 40); const v128_t vk4x4567 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x4567, vk4x4567), vacc4567p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vi5x4567 = 
wasm_v128_load(i5 + 4); i5 += 8; const v128_t vk5x0123 = wasm_v128_load(w + 48); const v128_t vk5x4567 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x4567, vk5x4567), vacc4567p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vi6x4567 = wasm_v128_load(i6 + 4); i6 += 8; const v128_t vk6x0123 = wasm_v128_load(w + 56); const v128_t vk6x4567 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x4567, vk6x4567), vacc4567p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vi7x4567 = wasm_v128_load(i7 + 4); i7 += 8; const v128_t vk7x0123 = wasm_v128_load(w + 64); const v128_t vk7x4567 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x4567, vk7x4567), vacc4567p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vi8x4567 = wasm_v128_load(i8 + 4); i8 += 8; const v128_t vk8x0123 = wasm_v128_load(w + 72); const v128_t vk8x4567 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x4567, vk8x4567), vacc4567p0); w += 80; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); v128_t vacc4567 = wasm_f32x4_pmax(vmin, vacc4567p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); vacc4567 = wasm_f32x4_pmin(vmax, vacc4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); w += 4; v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = 
wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vk5x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); v128_t vacc0123 = wasm_f32x4_pmax(vmin, vacc0123p0); vacc0123 = wasm_f32x4_pmin(vmax, vacc0123); if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv_ukernel_9p8c__wasmsimd( size_t channels, size_t output_width, const float** input, const float* weights, float* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const float* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } input = (const float**) ((uintptr_t) input + 
input_stride); size_t c = channels; const float* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123p0 = wasm_v128_load(w); v128_t vacc4567p0 = wasm_v128_load(w + 4); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vk0x0123 = wasm_v128_load(w + 8); const v128_t vk0x4567 = wasm_v128_load(w + 12); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vk0x4567), vacc4567p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vk1x0123 = wasm_v128_load(w + 16); const v128_t vk1x4567 = wasm_v128_load(w + 20); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vk1x4567), vacc4567p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vi2x4567 = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vk2x0123 = wasm_v128_load(w + 24); const v128_t vk2x4567 = wasm_v128_load(w + 28); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x4567, vk2x4567), vacc4567p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vi3x4567 = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vk3x0123 = wasm_v128_load(w + 32); const v128_t vk3x4567 = wasm_v128_load(w + 36); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x4567, vk3x4567), vacc4567p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vi4x4567 = wasm_v128_load(i4 + 4); i4 += 8; const v128_t vk4x0123 = wasm_v128_load(w + 40); const v128_t vk4x4567 = wasm_v128_load(w + 44); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x4567, vk4x4567), vacc4567p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vi5x4567 = wasm_v128_load(i5 + 4); i5 += 8; const v128_t vk5x0123 = wasm_v128_load(w + 48); const v128_t vk5x4567 = wasm_v128_load(w + 52); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x4567, vk5x4567), vacc4567p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vi6x4567 = wasm_v128_load(i6 + 4); i6 += 8; const v128_t vk6x0123 = wasm_v128_load(w + 56); const v128_t vk6x4567 = wasm_v128_load(w + 60); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x4567, vk6x4567), vacc4567p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vi7x4567 = wasm_v128_load(i7 + 4); i7 += 8; const v128_t vk7x0123 = wasm_v128_load(w + 64); const v128_t vk7x4567 = wasm_v128_load(w + 68); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x4567, vk7x4567), vacc4567p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vi8x4567 = wasm_v128_load(i8 + 4); i8 += 8; const v128_t vk8x0123 = wasm_v128_load(w + 72); const v128_t vk8x4567 = wasm_v128_load(w + 76); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); vacc4567p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x4567, vk8x4567), vacc4567p0); w += 80; const v128_t vacc0123 = vacc0123p0; const v128_t vacc4567 = vacc4567p0; wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; c >= 4; c -= 4) { v128_t vacc0123p0 = 
wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); i2 += 4; const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); i3 += 4; const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); i4 += 4; const v128_t vk4x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); i5 += 4; const v128_t vk5x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); i6 += 4; const v128_t vk6x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); i7 += 4; const v128_t vk7x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); i8 += 4; const v128_t vk8x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); w += 4; const v128_t vacc0123 = vacc0123p0; wasm_v128_store(output, vacc0123); output += 4; } if XNN_UNLIKELY(c != 0) { v128_t vacc0123p0 = wasm_v128_load(w); const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vk0x0123 = wasm_v128_load(w + 8); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vk0x0123), vacc0123p0); const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vk1x0123 = wasm_v128_load(w + 16); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vk1x0123), vacc0123p0); const v128_t vi2x0123 = wasm_v128_load(i2); const v128_t vk2x0123 = wasm_v128_load(w + 24); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi2x0123, vk2x0123), vacc0123p0); const v128_t vi3x0123 = wasm_v128_load(i3); const v128_t vk3x0123 = wasm_v128_load(w + 32); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi3x0123, vk3x0123), vacc0123p0); const v128_t vi4x0123 = wasm_v128_load(i4); const v128_t vk4x0123 = wasm_v128_load(w + 40); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi4x0123, vk4x0123), vacc0123p0); const v128_t vi5x0123 = wasm_v128_load(i5); const v128_t vk5x0123 = wasm_v128_load(w + 48); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi5x0123, vk5x0123), vacc0123p0); const v128_t vi6x0123 = wasm_v128_load(i6); const v128_t vk6x0123 = wasm_v128_load(w + 56); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi6x0123, vk6x0123), vacc0123p0); const v128_t vi7x0123 = wasm_v128_load(i7); const v128_t vk7x0123 = wasm_v128_load(w + 64); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi7x0123, vk7x0123), vacc0123p0); const v128_t vi8x0123 = wasm_v128_load(i8); const v128_t vk8x0123 = wasm_v128_load(w + 72); vacc0123p0 = wasm_f32x4_add(wasm_f32x4_mul(vi8x0123, vk8x0123), vacc0123p0); v128_t vacc0123 = vacc0123p0; if (c & 2) { wasm_v128_store64_lane(output, vacc0123, 0); vacc0123 = wasm_v64x2_shuffle(vacc0123, vacc0123, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vacc0123, 0); output += 1; } } output = (float*) 
((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4( size_t input_height, size_t input_width, const float* input, const float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top == 1); const v128_t vmask = wasm_v128_load(params->wasmsimd_stride1.mask); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride1.max); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride1.min); const v128_t vw0123 = wasm_v128_load(weights); const v128_t vw4567 = wasm_v128_load(weights + 4); const v128_t vw89 = wasm_v128_load64_splat(weights + 8); const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2); const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0); const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1); const v128_t vk12 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2); const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = input; const float* i2 = (const float*) ((uintptr_t) i1 + input_width); const float* i3 = (const float*) ((uintptr_t) i2 + input_width); float* o0 = output; float* o1 = (float*) ((uintptr_t) o0 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i2 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i3 = zero; } v128_t vi0x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4; v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4; v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; size_t w = input_width; for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) { const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; v128_t vo0p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi0x4567, vk01)); v128_t vo1p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi1x4567, vk01)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk11)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); vo0p0 = wasm_f32x4_add(vo0p0, 
wasm_f32x4_mul(vi1x3456, vk10)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk10)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); vi0x0123 = vi0x4567; vi1x0123 = vi1x4567; vi2x0123 = vi2x4567; vi3x0123 = vi3x4567; const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk12)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); vi0x4567 = vi0x89AB; vi1x4567 = vi1x89AB; vi2x4567 = vi2x89AB; vi3x4567 = vi3x89AB; v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); v128_t vo1 = wasm_f32x4_max(vo1p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); vo1 = wasm_f32x4_min(vo1, vmax); wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } // Always process the last block of 1..4 pixels. assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { vi0x4567 = wasm_v128_and(vmask, vi0x4567); vi1x4567 = wasm_v128_and(vmask, vi1x4567); vi2x4567 = wasm_v128_and(vmask, vi2x4567); vi3x4567 = wasm_v128_and(vmask, vi3x4567); v128_t vo0p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi0x4567, vk01)); v128_t vo1p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi1x4567, vk01)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk11)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk10)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk10)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk12)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); v128_t vo1 = wasm_f32x4_max(vo1p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); vo1 = wasm_f32x4_min(vo1, vmax); if XNN_LIKELY(w == 4 * 
sizeof(float)) { wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } else { if (w & (2 * sizeof(float))) { wasm_v128_store64_lane(o1, vo1, 0); o1 += 2; wasm_v128_store64_lane(o0, vo0, 0); o0 += 2; vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1); vo1 = wasm_v64x2_shuffle(vo1, vo1, 1, 1); } if (w & (1 * sizeof(float))) { wasm_v128_store32_lane(o1, vo1, 0); o1 += 1; wasm_v128_store32_lane(o0, vo0, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i2 - input_decrement); i1 = (const float*) ((uintptr_t) i3 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); o0 = o1; o1 = (float*) ((uintptr_t) o0 + input_width); output_height = doz(output_height, 2); } while (output_height != 0); } void xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4( size_t input_height, size_t input_width, const float* input, const float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top == 1); const v128_t vmask = wasm_v128_load(params->wasmsimd_stride1.mask); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride1.max); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride1.min); const v128_t vw0123 = wasm_v128_load(weights); const v128_t vw4567 = wasm_v128_load(weights + 4); const v128_t vw89 = wasm_v128_load64_splat(weights + 8); const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2); const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0); const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1); const v128_t vk12 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2); const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); const v128_t vk21 = wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0); const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = input; const float* i2 = (const float*) ((uintptr_t) i1 + input_width); const float* i3 = (const float*) ((uintptr_t) i2 + input_width); float* o0 = output; float* o1 = (float*) ((uintptr_t) o0 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i2 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i3 = zero; } v128_t vi0x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4; v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4; v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; size_t w = input_width; for (; w > 4 * sizeof(float); w -= 4 * sizeof(float)) { const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; v128_t vo0p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi0x4567, vk01)); v128_t vo1p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi1x4567, vk01)); vo0p0 = 
wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk11)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk10)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk10)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); vi0x0123 = vi0x4567; vi1x0123 = vi1x4567; vi2x0123 = vi2x4567; vi3x0123 = vi3x4567; const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk12)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); vi0x4567 = vi0x89AB; vi1x4567 = vi1x89AB; vi2x4567 = vi2x89AB; vi3x4567 = vi3x89AB; v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); v128_t vo1 = wasm_f32x4_pmax(vmin, vo1p0); vo0 = wasm_f32x4_pmin(vmax, vo0); vo1 = wasm_f32x4_pmin(vmax, vo1); wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } // Always process the last block of 1..4 pixels. 
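// Note: the final block below first zeroes the out-of-range lanes of the
// last row vectors with `vmask` (the loads themselves may overrun, which
// XNN_OOB_READS allows), so all nine 3x3 taps can be applied
// unconditionally; only the store is narrowed to the 1..4 valid pixels.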
assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { vi0x4567 = wasm_v128_and(vmask, vi0x4567); vi1x4567 = wasm_v128_and(vmask, vi1x4567); vi2x4567 = wasm_v128_and(vmask, vi2x4567); vi3x4567 = wasm_v128_and(vmask, vi3x4567); v128_t vo0p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi0x4567, vk01)); v128_t vo1p0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vi1x4567, vk01)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk11)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk21)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk21)); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk10)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk10)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk12)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); v128_t vo1 = wasm_f32x4_pmax(vmin, vo1p0); vo0 = wasm_f32x4_pmin(vmax, vo0); vo1 = wasm_f32x4_pmin(vmax, vo1); if XNN_LIKELY(w == 4 * sizeof(float)) { wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } else { if (w & (2 * sizeof(float))) { wasm_v128_store64_lane(o1, vo1, 0); o1 += 2; wasm_v128_store64_lane(o0, vo0, 0); o0 += 2; vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1); vo1 = wasm_v64x2_shuffle(vo1, vo1, 1, 1); } if (w & (1 * sizeof(float))) { wasm_v128_store32_lane(o1, vo1, 0); o1 += 1; wasm_v128_store32_lane(o0, vo0, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i2 - input_decrement); i1 = (const float*) ((uintptr_t) i3 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); o0 = o1; o1 = (float*) ((uintptr_t) o0 + input_width); output_height = doz(output_height, 2); } while (output_height != 0); } void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4( size_t input_height, size_t input_width, const float* input, const float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top >= 0); assert(padding_top <= 1); const v128_t vmask_even = wasm_v128_load(params->wasmsimd_stride2.mask_even); const v128_t vmask_odd = 
wasm_v128_load(params->wasmsimd_stride2.mask_odd); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride2.max); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride2.min); const v128_t vw0123 = wasm_v128_load(weights); const v128_t vw4567 = wasm_v128_load(weights + 4); const v128_t vw89 = wasm_v128_load64_splat(weights + 8); const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float)); const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width)); const float* i1 = (const float*) ((uintptr_t) i0 + input_width); if XNN_UNPREDICTABLE(padding_top != 0) { i0 = zero; } const float* i2 = (const float*) ((uintptr_t) i1 + input_width); float* o0 = output; size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */; size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; do { if XNN_UNPREDICTABLE(padded_input_height < 4) { i2 = zero; } v128_t vi0x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi1x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi2x1357 = wasm_f32x4_const_splat(0.0f); size_t w = input_width; for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); const v128_t vi0x89AB = wasm_v128_load(i0); const v128_t vi0xCDEF = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1x89AB = wasm_v128_load(i1); const v128_t vi1xCDEF = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vi2x89AB = wasm_v128_load(i2); const v128_t vi2xCDEF = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6); const v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7); const v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6); const v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7); const v128_t vi2x8ACE = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6); const v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7); v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)); v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); v128_t vo0p3 = wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0)); const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6); vi0x1357 = vi0x9BDF; const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6); vi1x1357 = vi1x9BDF; const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6); vi2x1357 = vi2x9BDF; vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); wasm_v128_store(o0, vo0); o0 += 4; } // Last block has 0-7 pixels to process. 
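// Note: for this stride-2 kernel, each pair of 4-wide loads is
// de-interleaved by the 8ACE/9BDF shuffles into even (center-tap) and odd
// (left/right-tap) columns. In the remainder block below, `w` is at most
// 7 pixels; the `w += 1 * sizeof(float)` bump turns the output count,
// ceil(w / 2), into plain bit tests: w = 7 pixels sets the
// 8*sizeof(float) bit (store 4), w = 3..6 sets the 4*sizeof(float) bit
// (store 2), and w = 1, 2, 5, 6 sets the 2*sizeof(float) bit (store 1 more).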
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);

      const v128_t vi0x89AB = wasm_v128_load(i0);
      const v128_t vi0xCDEF = wasm_v128_load(i0 + 4);
      const v128_t vi1x89AB = wasm_v128_load(i1);
      const v128_t vi1xCDEF = wasm_v128_load(i1 + 4);
      const v128_t vi2x89AB = wasm_v128_load(i2);
      const v128_t vi2xCDEF = wasm_v128_load(i2 + 4);

      const v128_t vi0x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6));
      const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7));
      const v128_t vi1x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6));
      const v128_t vi1x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7));
      const v128_t vi2x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6));
      const v128_t vi2x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7));

      v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2));
      v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1));
      v128_t vo0p3 = wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0));

      const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
      const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
      const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
      vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
      vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
      vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo0p2 = wasm_f32x4_add(vo0p2, vo0p3);
      vo0p0 = wasm_f32x4_add(vo0p0, vo0p2);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);

      w += 1 * sizeof(float);
      if (w & (8 * sizeof(float))) {
        wasm_v128_store(o0, vo0);
        o0 += 4;
      } else {
        if (w & (4 * sizeof(float))) {
          wasm_v128_store64_lane(o0, vo0, 0);
          o0 += 2;
          vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1);
        }
        if (w & (2 * sizeof(float))) {
          wasm_v128_store32_lane(o0, vo0, 0);
          o0 += 1;
        }
      }
    }

    // Rewind the row pointers for the next pair of input rows.
    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i0 + input_width);
    i2 = (const float*) ((uintptr_t) i1 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}

void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 0);
  assert(padding_top <= 1);

  const v128_t vmask_even = wasm_v128_load(params->wasmsimd_stride2.mask_even);
  const v128_t vmask_odd = wasm_v128_load(params->wasmsimd_stride2.mask_odd);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride2.max);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride2.min);

  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t vw89 = wasm_v128_load64_splat(weights + 8);

  const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));

  const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
  if XNN_UNPREDICTABLE(padding_top != 0) {
    i0 = zero;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
  size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 4) {
      i2 = zero;
    }

    v128_t vi0x1357 = wasm_f32x4_const_splat(0.0f);
    v128_t vi1x1357 = wasm_f32x4_const_splat(0.0f);
    v128_t vi2x1357 = wasm_f32x4_const_splat(0.0f);

    size_t w = input_width;
    for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
      v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);

      const v128_t vi0x89AB = wasm_v128_load(i0);
      const v128_t vi0xCDEF = wasm_v128_load(i0 + 4);
      i0 += 8;
      const v128_t vi1x89AB = wasm_v128_load(i1);
      const v128_t vi1xCDEF = wasm_v128_load(i1 + 4);
      i1 += 8;
      const v128_t vi2x89AB = wasm_v128_load(i2);
      const v128_t vi2xCDEF = wasm_v128_load(i2 + 4);
      i2 += 8;

      const v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6);
      const v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7);
      const v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6);
      const v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7);
      const v128_t vi2x8ACE = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6);
      const v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0)));

      const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
      vi0x1357 = vi0x9BDF;
      const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
      vi1x1357 = vi1x9BDF;
      const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);
      vi2x1357 = vi2x9BDF;

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);

      v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0);
      vo0 = wasm_f32x4_pmin(vmax, vo0);

      wasm_v128_store(o0, vo0);
      o0 += 4;
    }
    // Last block has 0-7 pixels to process.
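    // This _x86_ variant differs from the preceding kernel in two ways: it keeps
    // two partial accumulators (vo0p0/vo0p1) instead of four, and it clamps with
    // wasm_f32x4_pmax/pmin, which lower to single maxps/minps instructions on
    // x86; the non-pseudo wasm_f32x4_max/min used above map directly to the NEON
    // vector min/max on ARM. That codegen difference is why these otherwise
    // identical kernels exist side by side.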
    assert(w < 8 * sizeof(float));
    if XNN_LIKELY(w != 0) {
      v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);

      const v128_t vi0x89AB = wasm_v128_load(i0);
      const v128_t vi0xCDEF = wasm_v128_load(i0 + 4);
      const v128_t vi1x89AB = wasm_v128_load(i1);
      const v128_t vi1xCDEF = wasm_v128_load(i1 + 4);
      const v128_t vi2x89AB = wasm_v128_load(i2);
      const v128_t vi2xCDEF = wasm_v128_load(i2 + 4);

      const v128_t vi0x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6));
      const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7));
      const v128_t vi1x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6));
      const v128_t vi1x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7));
      const v128_t vi2x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6));
      const v128_t vi2x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7));

      v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0)));

      const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
      const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
      const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);

      v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0);
      vo0 = wasm_f32x4_pmin(vmax, vo0);

      w += 1 * sizeof(float);
      if (w & (8 * sizeof(float))) {
        wasm_v128_store(o0, vo0);
        o0 += 4;
      } else {
        if (w & (4 * sizeof(float))) {
          wasm_v128_store64_lane(o0, vo0, 0);
          o0 += 2;
          vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1);
        }
        if (w & (2 * sizeof(float))) {
          wasm_v128_store32_lane(o0, vo0, 0);
          o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i0 + input_width);
    i2 = (const float*) ((uintptr_t) i1 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}

void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const v128_t vmask = wasm_v128_load(params->wasmsimd_stride1.mask);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride1.max);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride1.min);

  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t
vw89AB = wasm_v128_load(weights + 8); const v128_t vwCDEF = wasm_v128_load(weights + 12); const v128_t vwGHIJ = wasm_v128_load(weights + 16); const v128_t vwKLMN = wasm_v128_load(weights + 20); const v128_t vwOP = wasm_v128_load64_splat(weights + 24); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = zero; const float* i2 = input; const float* i3 = (const float*) ((uintptr_t) i2 + input_width); const float* i4 = (const float*) ((uintptr_t) i3 + input_width); const float* i5 = (const float*) ((uintptr_t) i4 + input_width); const float* i6 = (const float*) ((uintptr_t) i5 + input_width); float* o0 = output; float* o1 = (float*) ((uintptr_t) o0 + input_width); float* o2 = (float*) ((uintptr_t) o1 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i3 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i4 = zero; o2 = o1; } if XNN_UNPREDICTABLE(output_height < 4) { i5 = zero; } if XNN_UNPREDICTABLE(output_height < 5) { i6 = zero; } v128_t vi0x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi4x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi5x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi6x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4; v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4; v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; v128_t vi6x4567 = wasm_v128_load(i6); i6 += 4; size_t w = input_width; for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; const v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4; const v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4; const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = 
wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); vi0x0123 = vi0x4567; const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); vi1x0123 = vi1x4567; const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); vi2x0123 = vi2x4567; const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); vi3x0123 = vi3x4567; const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); vi4x0123 = vi4x4567; const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); vi5x0123 = vi5x4567; const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vi6x0123 = vi6x4567; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, 
wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5); vi0x4567 = vi0x89AB; const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5); vi1x4567 = vi1x89AB; const v128_t vi2x6789 = 
wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5); vi2x4567 = vi2x89AB; const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); vi3x4567 = vi3x89AB; const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); vi4x4567 = vi4x89AB; const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); vi5x4567 = vi5x89AB; const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5); vi6x4567 = vi6x89AB; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); v128_t vo1 = wasm_f32x4_max(vo1p0, vmin); v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); vo1 = wasm_f32x4_min(vo1, vmax); vo2 = wasm_f32x4_min(vo2, vmax); wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } // Always process the last block of 5..8 pixels. 
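    // On exit from the main loop w is at most 8 * sizeof(float). When more than
    // four columns remain, the block below computes one more full output vector,
    // masking the trailing load with vmask so out-of-bounds lanes read as zero;
    // the block after it then handles the final 1..4 columns with partial stores.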
if XNN_LIKELY(w > 4 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4; v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4; v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4; vi0x89AB = wasm_v128_and(vmask, vi0x89AB); vi1x89AB = wasm_v128_and(vmask, vi1x89AB); vi2x89AB = wasm_v128_and(vmask, vi2x89AB); vi3x89AB = wasm_v128_and(vmask, vi3x89AB); vi4x89AB = wasm_v128_and(vmask, vi4x89AB); vi5x89AB = wasm_v128_and(vmask, vi5x89AB); vi6x89AB = wasm_v128_and(vmask, vi6x89AB); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, 
wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); vi0x0123 = vi0x4567; const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); vi1x0123 = vi1x4567; const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); vi2x0123 = vi2x4567; const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); vi3x0123 = vi3x4567; const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); vi4x0123 = vi4x4567; const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); vi5x0123 = vi5x4567; const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vi6x0123 = vi6x4567; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); 
const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5); vi0x4567 = vi0x89AB; const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5); vi1x4567 = vi1x89AB; const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5); vi2x4567 = vi2x89AB; const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); vi3x4567 = vi3x89AB; const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); vi4x4567 = vi4x89AB; const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); vi5x4567 = vi5x89AB; const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5); vi6x4567 = vi6x89AB; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, 
wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); v128_t vo1 = wasm_f32x4_max(vo1p0, vmin); v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); vo1 = wasm_f32x4_min(vo1, vmax); vo2 = wasm_f32x4_min(vo2, vmax); wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; w -= 4 * sizeof(float); } assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); vi0x4567 = wasm_v128_and(vmask, vi0x4567); vi1x4567 = wasm_v128_and(vmask, vi1x4567); vi2x4567 = wasm_v128_and(vmask, vi2x4567); vi3x4567 = wasm_v128_and(vmask, vi3x4567); vi4x4567 = wasm_v128_and(vmask, vi4x4567); vi5x4567 = wasm_v128_and(vmask, vi5x4567); vi6x4567 = wasm_v128_and(vmask, vi6x4567); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, 
wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = 
wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4); const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x5678, vzero, 1, 2, 3, 4); const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x5678, vzero, 1, 2, 3, 4); const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x5678, vzero, 1, 2, 3, 4); const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4); const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x5678, vzero, 1, 2, 3, 4); const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, 
vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_max(vo0p0, vmin); v128_t vo1 = wasm_f32x4_max(vo1p0, vmin); v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); vo0 = wasm_f32x4_min(vo0, vmax); vo1 = wasm_f32x4_min(vo1, vmax); vo2 = wasm_f32x4_min(vo2, vmax); if XNN_LIKELY(w & (4 * sizeof(float))) { wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } else { if (w & (2 * sizeof(float))) { wasm_v128_store64_lane(o2, vo2, 0); o2 += 2; wasm_v128_store64_lane(o1, vo1, 0); o1 += 2; wasm_v128_store64_lane(o0, vo0, 0); o0 += 2; vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1); vo1 = wasm_v64x2_shuffle(vo1, vo1, 1, 1); vo2 = wasm_v64x2_shuffle(vo2, vo2, 1, 1); } if (w & (1 * sizeof(float))) { wasm_v128_store32_lane(o2, vo2, 0); o2 += 1; wasm_v128_store32_lane(o1, vo1, 0); o1 += 1; wasm_v128_store32_lane(o0, vo0, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i3 - input_decrement); i1 = (const float*) ((uintptr_t) i4 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); i4 = (const float*) ((uintptr_t) i3 + input_width); i5 = (const float*) ((uintptr_t) i4 + input_width); i6 = (const float*) ((uintptr_t) i5 + input_width); o0 = o2; o1 = (float*) ((uintptr_t) o0 + input_width); o2 = (float*) ((uintptr_t) o1 + input_width); output_height = doz(output_height, 3); } while (output_height != 0); } void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4( size_t input_height, size_t input_width, const float* input, const float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top == 2); const v128_t vmask = wasm_v128_load(params->wasmsimd_stride1.mask); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride1.max); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride1.min); const v128_t vw0123 = wasm_v128_load(weights); const v128_t vw4567 = wasm_v128_load(weights + 4); const v128_t vw89AB = wasm_v128_load(weights + 8); const v128_t vwCDEF = wasm_v128_load(weights + 12); const v128_t vwGHIJ = wasm_v128_load(weights + 16); const v128_t vwKLMN = wasm_v128_load(weights + 20); const v128_t vwOP = wasm_v128_load64_splat(weights + 24); const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float)); const float* i0 = zero; const float* i1 = zero; const float* i2 = input; const float* i3 = (const float*) ((uintptr_t) i2 + input_width); const float* i4 = (const float*) ((uintptr_t) i3 + input_width); const float* i5 = (const float*) ((uintptr_t) i4 + input_width); const float* i6 = (const float*) ((uintptr_t) i5 + input_width); float* o0 = output; 
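  // Like the _arm_ 3x4 kernel above, this variant computes three output rows per
  // pass, so seven input row pointers (i0..i6) stay live: with a 5x5 kernel,
  //   output row r   <- input rows r-2 .. r+2
  //   output row r+1 <- input rows r-1 .. r+3
  //   output row r+2 <- input rows r   .. r+4
  // and adjacent output rows reuse four of the five input rows.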
float* o1 = (float*) ((uintptr_t) o0 + input_width); float* o2 = (float*) ((uintptr_t) o1 + input_width); size_t output_height = input_height; do { if XNN_UNPREDICTABLE(output_height < 2) { i3 = zero; o1 = o0; } if XNN_UNPREDICTABLE(output_height < 3) { i4 = zero; o2 = o1; } if XNN_UNPREDICTABLE(output_height < 4) { i5 = zero; } if XNN_UNPREDICTABLE(output_height < 5) { i6 = zero; } v128_t vi0x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi4x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi5x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi6x0123 = wasm_f32x4_const_splat(0.0f); v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4; v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4; v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; v128_t vi6x4567 = wasm_v128_load(i6); i6 += 4; size_t w = input_width; for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; const v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4; const v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4; const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, 
vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); vi0x0123 = vi0x4567; const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); vi1x0123 = vi1x4567; const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); vi2x0123 = vi2x4567; const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); vi3x0123 = vi3x4567; const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); vi4x0123 = vi4x4567; const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); vi5x0123 = vi5x4567; const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vi6x0123 = vi6x4567; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); 
vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5); vi0x4567 = vi0x89AB; const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5); vi1x4567 = vi1x89AB; const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5); vi2x4567 = vi2x89AB; const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); vi3x4567 = vi3x89AB; const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); vi4x4567 = vi4x89AB; const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); vi5x4567 = vi5x89AB; const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5); vi6x4567 = vi6x89AB; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, 
wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); v128_t vo1 = wasm_f32x4_pmax(vmin, vo1p0); v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); vo0 = wasm_f32x4_pmin(vmax, vo0); vo1 = wasm_f32x4_pmin(vmax, vo1); vo2 = wasm_f32x4_pmin(vmax, vo2); wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } // Always process the last block of 5..8 pixels. if XNN_LIKELY(w > 4 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4; v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4; v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4; v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4; v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4; v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4; v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4; vi0x89AB = wasm_v128_and(vmask, vi0x89AB); vi1x89AB = wasm_v128_and(vmask, vi1x89AB); vi2x89AB = wasm_v128_and(vmask, vi2x89AB); vi3x89AB = wasm_v128_and(vmask, vi3x89AB); vi4x89AB = wasm_v128_and(vmask, vi4x89AB); vi5x89AB = wasm_v128_and(vmask, vi5x89AB); vi6x89AB = wasm_v128_and(vmask, vi6x89AB); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, 
wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); vi0x0123 = vi0x4567; const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); vi1x0123 = vi1x4567; const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); vi2x0123 = vi2x4567; const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); vi3x0123 = vi3x4567; const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); vi4x0123 = vi4x4567; const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); vi5x0123 = vi5x4567; const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vi6x0123 = vi6x4567; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, 
wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = 
wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5); vi0x4567 = vi0x89AB; const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5); vi1x4567 = vi1x89AB; const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5); vi2x4567 = vi2x89AB; const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); vi3x4567 = vi3x89AB; const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); vi4x4567 = vi4x89AB; const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); vi5x4567 = vi5x89AB; const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5); vi6x4567 = vi6x89AB; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); v128_t vo1 = wasm_f32x4_pmax(vmin, vo1p0); v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); vo0 = wasm_f32x4_pmin(vmax, vo0); vo1 = wasm_f32x4_pmin(vmax, vo1); vo2 = wasm_f32x4_pmin(vmax, vo2); wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; w -= 4 * sizeof(float); } assert(w >= 1 * sizeof(float)); assert(w <= 4 * sizeof(float)); { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); vi0x4567 = wasm_v128_and(vmask, vi0x4567); vi1x4567 = wasm_v128_and(vmask, vi1x4567); vi2x4567 = wasm_v128_and(vmask, vi2x4567); vi3x4567 = wasm_v128_and(vmask, vi3x4567); vi4x4567 = wasm_v128_and(vmask, vi4x4567); vi5x4567 = wasm_v128_and(vmask, vi5x4567); vi6x4567 = wasm_v128_and(vmask, vi6x4567); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 
0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6); const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6); const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5); const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5); const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5); const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, 
vi3x4567, 2, 3, 4, 5); const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4); const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4); const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4); const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4); const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo2p0 = 
wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x5678, vzero, 1, 2, 3, 4); const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x5678, vzero, 1, 2, 3, 4); const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x5678, vzero, 1, 2, 3, 4); const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4); const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x5678, vzero, 1, 2, 3, 4); const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); v128_t vo1 = wasm_f32x4_pmax(vmin, vo1p0); v128_t vo2 = wasm_f32x4_pmax(vmin, vo2p0); vo0 = wasm_f32x4_pmin(vmax, vo0); vo1 = wasm_f32x4_pmin(vmax, vo1); vo2 = wasm_f32x4_pmin(vmax, vo2); if XNN_LIKELY(w & (4 * sizeof(float))) { wasm_v128_store(o2, vo2); o2 += 4; wasm_v128_store(o1, vo1); o1 += 4; wasm_v128_store(o0, vo0); o0 += 4; } else { if (w & (2 * sizeof(float))) { wasm_v128_store64_lane(o2, vo2, 0); o2 += 2; wasm_v128_store64_lane(o1, vo1, 0); o1 += 2; wasm_v128_store64_lane(o0, vo0, 0); o0 += 2; vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1); vo1 = wasm_v64x2_shuffle(vo1, vo1, 1, 1); vo2 = wasm_v64x2_shuffle(vo2, vo2, 1, 1); } if (w & (1 * sizeof(float))) { wasm_v128_store32_lane(o2, vo2, 0); o2 += 1; wasm_v128_store32_lane(o1, vo1, 0); o1 += 1; wasm_v128_store32_lane(o0, vo0, 0); o0 += 1; } } } i0 = (const float*) ((uintptr_t) i3 - input_decrement); i1 = (const float*) ((uintptr_t) i4 - input_decrement); i2 = (const float*) ((uintptr_t) i1 + input_width); i3 = (const float*) ((uintptr_t) i2 + input_width); i4 = (const 
float*) ((uintptr_t) i3 + input_width); i5 = (const float*) ((uintptr_t) i4 + input_width); i6 = (const float*) ((uintptr_t) i5 + input_width); o0 = o2; o1 = (float*) ((uintptr_t) o0 + input_width); o2 = (float*) ((uintptr_t) o1 + input_width); output_height = doz(output_height, 3); } while (output_height != 0); } void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2( size_t input_height, size_t input_width, const float* input, const float* weights, const float* zero, float* output, uint32_t padding_top, const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(input_height != 0); assert(input_width != 0); assert(input_width % sizeof(float) == 0); assert(padding_top >= 1); assert(padding_top <= 2); const v128_t vmask_even = wasm_v128_load(params->wasmsimd_stride2.mask_even); const v128_t vmask_odd = wasm_v128_load(params->wasmsimd_stride2.mask_odd); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride2.max); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride2.min); const v128_t vw0123 = wasm_v128_load(weights); const v128_t vw4567 = wasm_v128_load(weights + 4); const v128_t vw89AB = wasm_v128_load(weights + 8); const v128_t vwCDEF = wasm_v128_load(weights + 12); const v128_t vwGHIJ = wasm_v128_load(weights + 16); const v128_t vwKLMN = wasm_v128_load(weights + 20); const v128_t vwOP = wasm_v128_load64_splat(weights + 24); const uint32_t padding_top_less_1 = padding_top - 1; const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float)); const float* i0 = zero; const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width)); const float* i2 = (const float*) ((uintptr_t) i1 + input_width); if XNN_UNPREDICTABLE(padding_top_less_1 != 0) { i1 = zero; } const float* i3 = (const float*) ((uintptr_t) i2 + input_width); const float* i4 = (const float*) ((uintptr_t) i3 + input_width); float* o0 = output; size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */; size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2; do { if XNN_UNPREDICTABLE(padded_input_height < 6) { i3 = zero; } if XNN_UNPREDICTABLE(padded_input_height < 7) { i4 = zero; } v128_t vi0x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi4x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi0x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi1x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi2x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi3x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi4x1357 = wasm_f32x4_const_splat(0.0f); const v128_t vi0x89AB = wasm_v128_load(i0); const v128_t vi0xCDEF = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1x89AB = wasm_v128_load(i1); const v128_t vi1xCDEF = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vi2x89AB = wasm_v128_load(i2); const v128_t vi2xCDEF = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vi3x89AB = wasm_v128_load(i3); const v128_t vi3xCDEF = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vi4x89AB = wasm_v128_load(i4); const v128_t vi4xCDEF = wasm_v128_load(i4 + 4); i4 += 8; v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6); v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7); v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6); v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7); v128_t vi2x8ACE = 
wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6); v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7); v128_t vi3x8ACE = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 0, 2, 4, 6); v128_t vi3x9BDF = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 1, 3, 5, 7); v128_t vi4x8ACE = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 0, 2, 4, 6); v128_t vi4x9BDF = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 1, 3, 5, 7); size_t w = input_width; for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6); vi0x0246 = vi0x8ACE; const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6); vi1x0246 = vi1x8ACE; const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6); vi2x0246 = vi2x8ACE; const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6); vi3x0246 = vi3x8ACE; const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6); vi4x0246 = vi4x8ACE; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6); vi0x1357 = vi0x9BDF; const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6); vi1x1357 = vi1x9BDF; const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6); vi2x1357 = vi2x9BDF; const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6); vi3x1357 = vi3x9BDF; const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6); vi4x1357 = vi4x9BDF; const v128_t vi0xGHIJ = wasm_v128_load(i0); const v128_t vi0xKLMN = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1xGHIJ = wasm_v128_load(i1); const v128_t vi1xKLMN = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vi2xGHIJ = wasm_v128_load(i2); const v128_t vi2xKLMN = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vi3xGHIJ = wasm_v128_load(i3); const v128_t vi3xKLMN = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vi4xGHIJ = wasm_v128_load(i4); const v128_t vi4xKLMN = 
wasm_v128_load(i4 + 4); i4 += 8;

      const v128_t vi0xGIKM = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 0, 2, 4, 6);
      const v128_t vi0xHJLN = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 1, 3, 5, 7);
      const v128_t vi1xGIKM = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 0, 2, 4, 6);
      const v128_t vi1xHJLN = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 1, 3, 5, 7);
      const v128_t vi2xGIKM = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 0, 2, 4, 6);
      const v128_t vi2xHJLN = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 1, 3, 5, 7);
      const v128_t vi3xGIKM = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 0, 2, 4, 6);
      const v128_t vi3xHJLN = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 1, 3, 5, 7);
      const v128_t vi4xGIKM = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 0, 2, 4, 6);
      const v128_t vi4xHJLN = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 1, 3, 5, 7);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));

      const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vi0xGIKM, 1, 2, 3, 4); vi0x8ACE = vi0xGIKM; vi0x9BDF = vi0xHJLN;
      const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vi1xGIKM, 1, 2, 3, 4); vi1x8ACE = vi1xGIKM; vi1x9BDF = vi1xHJLN;
      const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vi2xGIKM, 1, 2, 3, 4); vi2x8ACE = vi2xGIKM; vi2x9BDF = vi2xHJLN;
      const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vi3xGIKM, 1, 2, 3, 4); vi3x8ACE = vi3xGIKM; vi3x9BDF = vi3xHJLN;
      const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vi4xGIKM, 1, 2, 3, 4); vi4x8ACE = vi4xGIKM; vi4x9BDF = vi4xHJLN;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);

      wasm_v128_store(o0, vo0); o0 += 4;
    }

    // Last block has 1-8 pixels to process.
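    // The tail below reuses the vi*x8ACE/vi*x9BDF vectors that were
    // deinterleaved above. vmask_even/vmask_odd zero the lanes that lie
    // past the end of the row, so values read out of bounds (permitted by
    // XNN_OOB_READS) cannot contribute to the dot products.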
    assert(w <= 8 * sizeof(float));
    assert(w >= 1 * sizeof(float));
    {
      v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);

      vi0x8ACE = wasm_v128_and(vmask_even, vi0x8ACE);
      vi1x8ACE = wasm_v128_and(vmask_even, vi1x8ACE);
      vi2x8ACE = wasm_v128_and(vmask_even, vi2x8ACE);
      vi3x8ACE = wasm_v128_and(vmask_even, vi3x8ACE);
      vi4x8ACE = wasm_v128_and(vmask_even, vi4x8ACE);

      vi0x9BDF = wasm_v128_and(vmask_odd, vi0x9BDF);
      vi1x9BDF = wasm_v128_and(vmask_odd, vi1x9BDF);
      vi2x9BDF = wasm_v128_and(vmask_odd, vi2x9BDF);
      vi3x9BDF = wasm_v128_and(vmask_odd, vi3x9BDF);
      vi4x9BDF = wasm_v128_and(vmask_odd, vi4x9BDF);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0)));

      const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6);
      const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6);
      const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6);
      const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6);
      const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1)));

      const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
      const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
      const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);
      const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6);
      const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));

      const v128_t vzero = wasm_f32x4_const_splat(0.0f);
      const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vzero, 1, 2, 3, 4);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);

      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        wasm_v128_store(o0, vo0); o0 += 4;
      } else {
        if (w_tmp & 2) {
          wasm_v128_store64_lane(o0, vo0, 0); o0 += 2;
          vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1);
        }
        if (w_tmp & 1) {
          wasm_v128_store32_lane(o0, vo0, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i3 - input_decrement);
    i2 = (const float*) ((uintptr_t) i4 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}

void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top >= 1);
  assert(padding_top <= 2);

  const v128_t vmask_even = wasm_v128_load(params->wasmsimd_stride2.mask_even);
  const v128_t vmask_odd = wasm_v128_load(params->wasmsimd_stride2.mask_odd);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd_stride2.max);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd_stride2.min);

  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t vw89AB = wasm_v128_load(weights + 8);
  const v128_t vwCDEF = wasm_v128_load(weights + 12);
  const v128_t vwGHIJ = wasm_v128_load(weights + 16);
  const v128_t vwKLMN = wasm_v128_load(weights + 20);
  const v128_t vwOP = wasm_v128_load64_splat(weights + 24);

  const uint32_t padding_top_less_1 = padding_top - 1;
  const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));

  const float* i0 = zero;
  const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
  const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
  if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
    i1 = zero;
  }
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);

  float* o0 = output;

  size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
  size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
  do {
    if XNN_UNPREDICTABLE(padded_input_height < 6) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(padded_input_height < 7) {
      i4 = zero;
    }

    v128_t
vi0x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi1x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi2x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi3x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi4x0246 = wasm_f32x4_const_splat(0.0f); v128_t vi0x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi1x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi2x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi3x1357 = wasm_f32x4_const_splat(0.0f); v128_t vi4x1357 = wasm_f32x4_const_splat(0.0f); const v128_t vi0x89AB = wasm_v128_load(i0); const v128_t vi0xCDEF = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1x89AB = wasm_v128_load(i1); const v128_t vi1xCDEF = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vi2x89AB = wasm_v128_load(i2); const v128_t vi2xCDEF = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vi3x89AB = wasm_v128_load(i3); const v128_t vi3xCDEF = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vi4x89AB = wasm_v128_load(i4); const v128_t vi4xCDEF = wasm_v128_load(i4 + 4); i4 += 8; v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6); v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7); v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6); v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7); v128_t vi2x8ACE = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6); v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7); v128_t vi3x8ACE = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 0, 2, 4, 6); v128_t vi3x9BDF = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 1, 3, 5, 7); v128_t vi4x8ACE = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 0, 2, 4, 6); v128_t vi4x9BDF = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 1, 3, 5, 7); size_t w = input_width; for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6); vi0x0246 = vi0x8ACE; const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6); vi1x0246 = vi1x8ACE; const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6); vi2x0246 = vi2x8ACE; const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6); vi3x0246 = vi3x8ACE; const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6); vi4x0246 = vi4x8ACE; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = 
wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6); vi0x1357 = vi0x9BDF; const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6); vi1x1357 = vi1x9BDF; const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6); vi2x1357 = vi2x9BDF; const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6); vi3x1357 = vi3x9BDF; const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6); vi4x1357 = vi4x9BDF; const v128_t vi0xGHIJ = wasm_v128_load(i0); const v128_t vi0xKLMN = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1xGHIJ = wasm_v128_load(i1); const v128_t vi1xKLMN = wasm_v128_load(i1 + 4); i1 += 8; const v128_t vi2xGHIJ = wasm_v128_load(i2); const v128_t vi2xKLMN = wasm_v128_load(i2 + 4); i2 += 8; const v128_t vi3xGHIJ = wasm_v128_load(i3); const v128_t vi3xKLMN = wasm_v128_load(i3 + 4); i3 += 8; const v128_t vi4xGHIJ = wasm_v128_load(i4); const v128_t vi4xKLMN = wasm_v128_load(i4 + 4); i4 += 8; const v128_t vi0xGIKM = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 0, 2, 4, 6); const v128_t vi0xHJLN = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 1, 3, 5, 7); const v128_t vi1xGIKM = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 0, 2, 4, 6); const v128_t vi1xHJLN = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 1, 3, 5, 7); const v128_t vi2xGIKM = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 0, 2, 4, 6); const v128_t vi2xHJLN = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 1, 3, 5, 7); const v128_t vi3xGIKM = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 0, 2, 4, 6); const v128_t vi3xHJLN = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 1, 3, 5, 7); const v128_t vi4xGIKM = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 0, 2, 4, 6); const v128_t vi4xHJLN = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 1, 3, 5, 7); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2))); const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vi0xGIKM, 1, 2, 3, 4); vi0x8ACE = vi0xGIKM; vi0x9BDF = vi0xHJLN; const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vi1xGIKM, 1, 2, 3, 4); vi1x8ACE = vi1xGIKM; vi1x9BDF = vi1xHJLN; const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vi2xGIKM, 1, 2, 3, 4); vi2x8ACE = vi2xGIKM; vi2x9BDF = vi2xHJLN; const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vi3xGIKM, 1, 2, 3, 4); vi3x8ACE = vi3xGIKM; vi3x9BDF = vi3xHJLN; const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vi4xGIKM, 1, 2, 3, 4); vi4x8ACE = vi4xGIKM; vi4x9BDF = vi4xHJLN; vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, 
wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))); vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0); vo0 = wasm_f32x4_pmin(vmax, vo0); wasm_v128_store(o0, vo0); o0 += 4; } // Last block has 1-8 pixels to process. assert(w <= 8 * sizeof(float)); assert(w >= 1 * sizeof(float)); { v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); vi0x8ACE = wasm_v128_and(vmask_even, vi0x8ACE); vi1x8ACE = wasm_v128_and(vmask_even, vi1x8ACE); vi2x8ACE = wasm_v128_and(vmask_even, vi2x8ACE); vi3x8ACE = wasm_v128_and(vmask_even, vi3x8ACE); vi4x8ACE = wasm_v128_and(vmask_even, vi4x8ACE); vi0x9BDF = wasm_v128_and(vmask_odd, vi0x9BDF); vi1x9BDF = wasm_v128_and(vmask_odd, vi1x9BDF); vi2x9BDF = wasm_v128_and(vmask_odd, vi2x9BDF); vi3x9BDF = wasm_v128_and(vmask_odd, vi3x9BDF); vi4x9BDF = wasm_v128_and(vmask_odd, vi4x9BDF); v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))); const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6); const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6); const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6); const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6); const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1))); const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6); const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6); const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6); const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6); const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2))); vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3))); vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, 
vwCDEF, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));

      const v128_t vzero = wasm_f32x4_const_splat(0.0f);
      const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vzero, 1, 2, 3, 4);
      const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vzero, 1, 2, 3, 4);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);

      v128_t vo0 = wasm_f32x4_pmax(vmin, vo0p0);
      vo0 = wasm_f32x4_pmin(vmax, vo0);

      size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
      if XNN_LIKELY(w_tmp >= 4) {
        wasm_v128_store(o0, vo0); o0 += 4;
      } else {
        if (w_tmp & 2) {
          wasm_v128_store64_lane(o0, vo0, 0); o0 += 2;
          vo0 = wasm_v64x2_shuffle(vo0, vo0, 1, 1);
        }
        if (w_tmp & 1) {
          wasm_v128_store32_lane(o0, vo0, 0); o0 += 1;
        }
      }
    }

    i0 = (const float*) ((uintptr_t) i2 - input_decrement);
    i1 = (const float*) ((uintptr_t) i3 - input_decrement);
    i2 = (const float*) ((uintptr_t) i4 - input_decrement);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);

    output_height -= 1;
    padded_input_height -= 2;
  } while (output_height != 0);
}

void xnn_f32_f16_vcvt_ukernel__wasmsimd_x24(
    size_t batch,
    const float* input,
    void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t vexp_bias = wasm_v128_load64_splat(params->wasmsimd.exp_bias);
  const v128_t vscale_to_inf = wasm_v128_load64_splat(params->wasmsimd.scale_to_inf);
  const v128_t vexpw_max = wasm_v128_load64_splat(params->wasmsimd.expw_max);
  const v128_t vscale_to_zero = wasm_v128_load64_splat(params->wasmsimd.scale_to_zero);
  const v128_t vbias_min = wasm_v128_load64_splat(params->wasmsimd.bias_min);
  const v128_t vmanth_mask = wasm_v128_load64_splat(params->wasmsimd.manth_mask);
  const v128_t vexph_mask = wasm_v128_load64_splat(params->wasmsimd.exph_mask);
  const v128_t vnanh = wasm_v128_load64_splat(params->wasmsimd.nanh);

  uint16_t* o = (uint16_t*) output;
  for (; batch >= 24 * sizeof(float); batch -= 24 * sizeof(float)) {
    const v128_t vx0 = wasm_v128_load(input);
    const v128_t vx1 = wasm_v128_load(input + 4);
    const v128_t vx2 = wasm_v128_load(input + 8);
    const v128_t vx3 = wasm_v128_load(input + 12);
    const v128_t vx4 = wasm_v128_load(input + 16);
    const v128_t vx5 = wasm_v128_load(input + 20);
    input += 24;

    const v128_t vabsx0 = wasm_f32x4_abs(vx0);
    const v128_t vabsx1 = wasm_f32x4_abs(vx1);
    const v128_t vabsx2 = wasm_f32x4_abs(vx2);
    const v128_t vabsx3 = wasm_f32x4_abs(vx3);
    const v128_t vabsx4 = wasm_f32x4_abs(vx4);
    const v128_t vabsx5 = wasm_f32x4_abs(vx5);

    const v128_t vsignx0 = wasm_v128_xor(vx0, vabsx0);
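    // x ^ |x| isolates the sign bit of each lane; the sign is carried
    // through the conversion separately and OR-ed back into the f16 result
    // at the end. The same pattern repeats for vx1-vx5 below.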
    const v128_t vsignx1 = wasm_v128_xor(vx1, vabsx1);
    const v128_t vsignx2 = wasm_v128_xor(vx2, vabsx2);
    const v128_t vsignx3 = wasm_v128_xor(vx3, vabsx3);
    const v128_t vsignx4 = wasm_v128_xor(vx4, vabsx4);
    const v128_t vsignx5 = wasm_v128_xor(vx5, vabsx5);

    v128_t vbias0 = wasm_i32x4_add(vabsx0, vexp_bias);
    v128_t vbias1 = wasm_i32x4_add(vabsx1, vexp_bias);
    v128_t vbias2 = wasm_i32x4_add(vabsx2, vexp_bias);
    v128_t vbias3 = wasm_i32x4_add(vabsx3, vexp_bias);
    v128_t vbias4 = wasm_i32x4_add(vabsx4, vexp_bias);
    v128_t vbias5 = wasm_i32x4_add(vabsx5, vexp_bias);

    v128_t vf0 = wasm_f32x4_mul(vabsx0, vscale_to_inf);
    v128_t vf1 = wasm_f32x4_mul(vabsx1, vscale_to_inf);
    v128_t vf2 = wasm_f32x4_mul(vabsx2, vscale_to_inf);
    v128_t vf3 = wasm_f32x4_mul(vabsx3, vscale_to_inf);
    v128_t vf4 = wasm_f32x4_mul(vabsx4, vscale_to_inf);
    v128_t vf5 = wasm_f32x4_mul(vabsx5, vscale_to_inf);

    const v128_t vnanmaskw0 = wasm_i32x4_gt(vabsx0, vexpw_max);
    const v128_t vnanmaskw1 = wasm_i32x4_gt(vabsx1, vexpw_max);
    const v128_t vnanmaskw2 = wasm_i32x4_gt(vabsx2, vexpw_max);
    const v128_t vnanmaskw3 = wasm_i32x4_gt(vabsx3, vexpw_max);
    const v128_t vnanmaskw4 = wasm_i32x4_gt(vabsx4, vexpw_max);
    const v128_t vnanmaskw5 = wasm_i32x4_gt(vabsx5, vexpw_max);

    vbias0 = wasm_v128_and(vbias0, vexpw_max);
    vbias1 = wasm_v128_and(vbias1, vexpw_max);
    vbias2 = wasm_v128_and(vbias2, vexpw_max);
    vbias3 = wasm_v128_and(vbias3, vexpw_max);
    vbias4 = wasm_v128_and(vbias4, vexpw_max);
    vbias5 = wasm_v128_and(vbias5, vexpw_max);

    vf0 = wasm_f32x4_mul(vf0, vscale_to_zero);
    vf1 = wasm_f32x4_mul(vf1, vscale_to_zero);
    vf2 = wasm_f32x4_mul(vf2, vscale_to_zero);
    vf3 = wasm_f32x4_mul(vf3, vscale_to_zero);
    vf4 = wasm_f32x4_mul(vf4, vscale_to_zero);
    vf5 = wasm_f32x4_mul(vf5, vscale_to_zero);

    const v128_t vnanmaskh0 = wasm_i16x8_narrow_i32x4(vnanmaskw0, vnanmaskw1);
    const v128_t vnanmaskh1 = wasm_i16x8_narrow_i32x4(vnanmaskw2, vnanmaskw3);
    const v128_t vnanmaskh2 = wasm_i16x8_narrow_i32x4(vnanmaskw4, vnanmaskw5);

    const v128_t vsignh0 = wasm_i16x8_narrow_i32x4(vsignx0, vsignx1);
    const v128_t vsignh1 = wasm_i16x8_narrow_i32x4(vsignx2, vsignx3);
    const v128_t vsignh2 = wasm_i16x8_narrow_i32x4(vsignx4, vsignx5);

    vbias0 = wasm_i16x8_max(vbias0, vbias_min);
    vbias1 = wasm_i16x8_max(vbias1, vbias_min);
    vbias2 = wasm_i16x8_max(vbias2, vbias_min);
    vbias3 = wasm_i16x8_max(vbias3, vbias_min);
    vbias4 = wasm_i16x8_max(vbias4, vbias_min);
    vbias5 = wasm_i16x8_max(vbias5, vbias_min);

    vf0 = wasm_f32x4_add(vf0, vbias0);
    vf1 = wasm_f32x4_add(vf1, vbias1);
    vf2 = wasm_f32x4_add(vf2, vbias2);
    vf3 = wasm_f32x4_add(vf3, vbias3);
    vf4 = wasm_f32x4_add(vf4, vbias4);
    vf5 = wasm_f32x4_add(vf5, vbias5);

    v128_t vexpw0 = wasm_i32x4_shr(vf0, 13);
    v128_t vexpw1 = wasm_i32x4_shr(vf1, 13);
    v128_t vexpw2 = wasm_i32x4_shr(vf2, 13);
    v128_t vexpw3 = wasm_i32x4_shr(vf3, 13);
    v128_t vexpw4 = wasm_i32x4_shr(vf4, 13);
    v128_t vexpw5 = wasm_i32x4_shr(vf5, 13);

    const v128_t vmantw0 = wasm_v128_and(vf0, vmanth_mask);
    const v128_t vmantw1 = wasm_v128_and(vf1, vmanth_mask);
    const v128_t vmantw2 = wasm_v128_and(vf2, vmanth_mask);
    const v128_t vmantw3 = wasm_v128_and(vf3, vmanth_mask);
    const v128_t vmantw4 = wasm_v128_and(vf4, vmanth_mask);
    const v128_t vmantw5 = wasm_v128_and(vf5, vmanth_mask);

    vexpw0 = wasm_v128_and(vexpw0, vexph_mask);
    vexpw1 = wasm_v128_and(vexpw1, vexph_mask);
    vexpw2 = wasm_v128_and(vexpw2, vexph_mask);
    vexpw3 = wasm_v128_and(vexpw3, vexph_mask);
    vexpw4 = wasm_v128_and(vexpw4, vexph_mask);
    vexpw5 = wasm_v128_and(vexpw5, vexph_mask);

    const v128_t vnonsignw0 = wasm_i32x4_add(vmantw0, vexpw0);
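    // vexpw* holds the rounded float bits shifted right by 13 and masked to
    // the f16 exponent field, and vmantw* the low mantissa bits; their sum
    // is the f16 bit pattern without sign. The i16x8 narrowing below packs
    // pairs of 32-bit vectors into one vector of candidate f16 values, with
    // NaN lanes patched in via the bitselect on vnanmaskh*.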
const v128_t vnonsignw1 = wasm_i32x4_add(vmantw1, vexpw1); const v128_t vnonsignw2 = wasm_i32x4_add(vmantw2, vexpw2); const v128_t vnonsignw3 = wasm_i32x4_add(vmantw3, vexpw3); const v128_t vnonsignw4 = wasm_i32x4_add(vmantw4, vexpw4); const v128_t vnonsignw5 = wasm_i32x4_add(vmantw5, vexpw5); const v128_t vnonsignh0 = wasm_i16x8_narrow_i32x4(vnonsignw0, vnonsignw1); const v128_t vnonsignh1 = wasm_i16x8_narrow_i32x4(vnonsignw2, vnonsignw3); const v128_t vnonsignh2 = wasm_i16x8_narrow_i32x4(vnonsignw4, vnonsignw5); const v128_t vabsh0 = wasm_v128_bitselect(vnanh, vnonsignh0, vnanmaskh0); const v128_t vabsh1 = wasm_v128_bitselect(vnanh, vnonsignh1, vnanmaskh1); const v128_t vabsh2 = wasm_v128_bitselect(vnanh, vnonsignh2, vnanmaskh2); const v128_t vh0 = wasm_v128_or(vabsh0, vsignh0); const v128_t vh1 = wasm_v128_or(vabsh1, vsignh1); const v128_t vh2 = wasm_v128_or(vabsh2, vsignh2); wasm_v128_store(o, vh0); wasm_v128_store(o + 8, vh1); wasm_v128_store(o + 16, vh2); o += 24; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx_lo = wasm_v128_load(input); const v128_t vx_hi = wasm_v128_load(input + 4); input += 8; const v128_t vabsx_lo = wasm_f32x4_abs(vx_lo); const v128_t vabsx_hi = wasm_f32x4_abs(vx_hi); const v128_t vsignx_lo = wasm_v128_xor(vx_lo, vabsx_lo); const v128_t vsignx_hi = wasm_v128_xor(vx_hi, vabsx_hi); v128_t vbias_lo = wasm_i32x4_add(vabsx_lo, vexp_bias); v128_t vbias_hi = wasm_i32x4_add(vabsx_hi, vexp_bias); v128_t vf_lo = wasm_f32x4_mul(vabsx_lo, vscale_to_inf); v128_t vf_hi = wasm_f32x4_mul(vabsx_hi, vscale_to_inf); const v128_t vnanmaskw_lo = wasm_i32x4_gt(vabsx_lo, vexpw_max); const v128_t vnanmaskw_hi = wasm_i32x4_gt(vabsx_hi, vexpw_max); vbias_lo = wasm_v128_and(vbias_lo, vexpw_max); vbias_hi = wasm_v128_and(vbias_hi, vexpw_max); vf_lo = wasm_f32x4_mul(vf_lo, vscale_to_zero); vf_hi = wasm_f32x4_mul(vf_hi, vscale_to_zero); const v128_t vnanmaskh = wasm_i16x8_narrow_i32x4(vnanmaskw_lo, vnanmaskw_hi); const v128_t vsignh = wasm_i16x8_narrow_i32x4(vsignx_lo, vsignx_hi); vbias_lo = wasm_i16x8_max(vbias_lo, vbias_min); vbias_hi = wasm_i16x8_max(vbias_hi, vbias_min); vf_lo = wasm_f32x4_add(vf_lo, vbias_lo); vf_hi = wasm_f32x4_add(vf_hi, vbias_hi); v128_t vexpw_lo = wasm_i32x4_shr(vf_lo, 13); v128_t vexpw_hi = wasm_i32x4_shr(vf_hi, 13); const v128_t vmantw_lo = wasm_v128_and(vf_lo, vmanth_mask); const v128_t vmantw_hi = wasm_v128_and(vf_hi, vmanth_mask); vexpw_lo = wasm_v128_and(vexpw_lo, vexph_mask); vexpw_hi = wasm_v128_and(vexpw_hi, vexph_mask); const v128_t vnonsignw_lo = wasm_i32x4_add(vmantw_lo, vexpw_lo); const v128_t vnonsignw_hi = wasm_i32x4_add(vmantw_hi, vexpw_hi); const v128_t vnonsignh = wasm_i16x8_narrow_i32x4(vnonsignw_lo, vnonsignw_hi); const v128_t vabsh = wasm_v128_bitselect(vnanh, vnonsignh, vnanmaskh); const v128_t vh = wasm_v128_or(vabsh, vsignh); wasm_v128_store(o, vh); o += 8; } if XNN_UNPREDICTABLE(batch != 0) { const v128_t vx_lo = wasm_v128_load(input); const float* input_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); const v128_t vx_hi = wasm_v128_load(input_hi); const v128_t vabsx_lo = wasm_f32x4_abs(vx_lo); const v128_t vabsx_hi = wasm_f32x4_abs(vx_hi); const v128_t vsignx_lo = wasm_v128_xor(vx_lo, vabsx_lo); const v128_t vsignx_hi = wasm_v128_xor(vx_hi, vabsx_hi); v128_t vbias_lo = wasm_i32x4_add(vabsx_lo, vexp_bias); v128_t vbias_hi = wasm_i32x4_add(vabsx_hi, vexp_bias); v128_t vf_lo = wasm_f32x4_mul(vabsx_lo, vscale_to_inf); v128_t vf_hi = wasm_f32x4_mul(vabsx_hi, vscale_to_inf); const v128_t 
vnanmaskw_lo = wasm_i32x4_gt(vabsx_lo, vexpw_max); const v128_t vnanmaskw_hi = wasm_i32x4_gt(vabsx_hi, vexpw_max); vbias_lo = wasm_v128_and(vbias_lo, vexpw_max); vbias_hi = wasm_v128_and(vbias_hi, vexpw_max); vf_lo = wasm_f32x4_mul(vf_lo, vscale_to_zero); vf_hi = wasm_f32x4_mul(vf_hi, vscale_to_zero); const v128_t vnanmaskh = wasm_i16x8_narrow_i32x4(vnanmaskw_lo, vnanmaskw_hi); const v128_t vsignh = wasm_i16x8_narrow_i32x4(vsignx_lo, vsignx_hi); vbias_lo = wasm_i16x8_max(vbias_lo, vbias_min); vbias_hi = wasm_i16x8_max(vbias_hi, vbias_min); vf_lo = wasm_f32x4_add(vf_lo, vbias_lo); vf_hi = wasm_f32x4_add(vf_hi, vbias_hi); v128_t vexpw_lo = wasm_i32x4_shr(vf_lo, 13); v128_t vexpw_hi = wasm_i32x4_shr(vf_hi, 13); const v128_t vmantw_lo = wasm_v128_and(vf_lo, vmanth_mask); const v128_t vmantw_hi = wasm_v128_and(vf_hi, vmanth_mask); vexpw_lo = wasm_v128_and(vexpw_lo, vexph_mask); vexpw_hi = wasm_v128_and(vexpw_hi, vexph_mask); const v128_t vnonsignw_lo = wasm_i32x4_add(vmantw_lo, vexpw_lo); const v128_t vnonsignw_hi = wasm_i32x4_add(vmantw_hi, vexpw_hi); const v128_t vnonsignh = wasm_i16x8_narrow_i32x4(vnonsignw_lo, vnonsignw_hi); const v128_t vabsh = wasm_v128_bitselect(vnanh, vnonsignh, vnanmaskh); v128_t vh = wasm_v128_or(vabsh, vsignh); if (batch & (4 * sizeof(float))) { wasm_v128_store64_lane(o, vh, 0); vh = wasm_v64x2_shuffle(vh, vh, 1, 1); o += 4; } if (batch & (2 * sizeof(float))) { wasm_v128_store32_lane(o, vh, 0); vh = wasm_i64x2_shr(vh, 32); o += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store16_lane(o, vh, 0); } } } void xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4( size_t elements, size_t channels, const float* input, float* output, const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(elements != 0); assert(elements % sizeof(float) == 0); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + elements); const float* i2 = (const float*) ((uintptr_t) i1 + elements); const float* i3 = (const float*) ((uintptr_t) i2 + elements); const v128_t vmask = wasm_v128_load(params->scalar.mask); const v128_t vmultiplier = wasm_v128_load32_splat(&params->scalar.multiplier); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.output_min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.output_max); while (channels >= 4) { v128_t vsum0 = wasm_f32x4_const_splat(0.0f); v128_t vsum1 = vsum0; v128_t vsum2 = vsum0; v128_t vsum3 = vsum0; size_t n = elements; while (n >= 4 * sizeof(float)) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; vsum0 = wasm_f32x4_add(vsum0, vi0); vsum1 = wasm_f32x4_add(vsum1, vi1); vsum2 = wasm_f32x4_add(vsum2, vi2); vsum3 = wasm_f32x4_add(vsum3, vi3); n -= 4 * sizeof(float); } if XNN_UNLIKELY(n != 0) { const v128_t vi0 = wasm_v128_and(wasm_v128_load(i0), vmask); i0 = (const float*) ((uintptr_t) i0 + n); const v128_t vi1 = wasm_v128_and(wasm_v128_load(i1), vmask); i1 = (const float*) ((uintptr_t) i1 + n); const v128_t vi2 = wasm_v128_and(wasm_v128_load(i2), vmask); i2 = (const float*) ((uintptr_t) i2 + n); const v128_t vi3 = wasm_v128_and(wasm_v128_load(i3), vmask); i3 = (const float*) ((uintptr_t) i3 + n); vsum0 = wasm_f32x4_add(vsum0, vi0); vsum1 = wasm_f32x4_add(vsum1, vi1); vsum2 = wasm_f32x4_add(vsum2, vi2); vsum3 = wasm_f32x4_add(vsum3, vi3); }
// Having exactly 4 rows makes this work out nicely as we end up with
// the 4 totals in 4 different lanes of the same vector.
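// The three shuffle+add pairs below reduce all four per-channel accumulators
// (vsum0..vsum3) at once: each pair adds the even and odd lanes of two
// vectors, acting as a 4x4 transpose followed by per-lane horizontal sums.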
const v128_t vsum01 = wasm_f32x4_add(wasm_v32x4_shuffle(vsum0, vsum1, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum0, vsum1, 1, 3, 5, 7)); const v128_t vsum23 = wasm_f32x4_add(wasm_v32x4_shuffle(vsum2, vsum3, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum2, vsum3, 1, 3, 5, 7)); const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum01, vsum23, 1, 3, 5, 7)); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); wasm_v128_store(output, vout); output += 4; i0 = i3; i1 = (const float*) ((uintptr_t) i0 + elements); i2 = (const float*) ((uintptr_t) i1 + elements); i3 = (const float*) ((uintptr_t) i2 + elements); channels -= 4; } while (channels != 0) { v128_t vsum = wasm_f32x4_const_splat(0.0f); size_t n = elements; while (n >= 4 * sizeof(float)) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; vsum = wasm_f32x4_add(vsum, vi0); n -= 4 * sizeof(float); } if XNN_UNLIKELY(n != 0) { v128_t vi0 = wasm_v128_and(vmask, wasm_v128_load(i0)); i0 = (const float*) ((uintptr_t) i0 + n); vsum = wasm_f32x4_add(vsum, vi0); } vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1, 3, 5, 7)); vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1, 3, 5, 7)); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); *output++ = wasm_f32x4_extract_lane(vout, 0); channels -= 1; } } void xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4( size_t elements, size_t channels, const float* input, float* output, const union xnn_f32_gavgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(elements != 0); assert(elements % sizeof(float) == 0); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + elements); const float* i2 = (const float*) ((uintptr_t) i1 + elements); const float* i3 = (const float*) ((uintptr_t) i2 + elements); const v128_t vmask = wasm_v128_load(params->scalar.mask); const v128_t vmultiplier = wasm_v128_load32_splat(&params->scalar.multiplier); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.output_min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.output_max); while (channels >= 4) { v128_t vsum0 = wasm_f32x4_const_splat(0.0f); v128_t vsum1 = vsum0; v128_t vsum2 = vsum0; v128_t vsum3 = vsum0; size_t n = elements; while (n >= 4 * sizeof(float)) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; vsum0 = wasm_f32x4_add(vsum0, vi0); vsum1 = wasm_f32x4_add(vsum1, vi1); vsum2 = wasm_f32x4_add(vsum2, vi2); vsum3 = wasm_f32x4_add(vsum3, vi3); n -= 4 * sizeof(float); } if XNN_UNLIKELY(n != 0) { const v128_t vi0 = wasm_v128_and(wasm_v128_load(i0), vmask); i0 = (const float*) ((uintptr_t) i0 + n); const v128_t vi1 = wasm_v128_and(wasm_v128_load(i1), vmask); i1 = (const float*) ((uintptr_t) i1 + n); const v128_t vi2 = wasm_v128_and(wasm_v128_load(i2), vmask); i2 = (const float*) ((uintptr_t) i2 + n); const v128_t vi3 = wasm_v128_and(wasm_v128_load(i3), vmask); i3 = (const float*) ((uintptr_t) i3 + n); vsum0 = wasm_f32x4_add(vsum0, vi0); vsum1 = wasm_f32x4_add(vsum1, vi1); vsum2 = wasm_f32x4_add(vsum2, vi2); vsum3 = wasm_f32x4_add(vsum3, vi3); }
// Having exactly 4 rows makes this work out nicely as we end up with
// the 4 totals in 4 different lanes of the same vector.
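// Same reduction as the _arm_ variant above, but the clamping below uses
// wasm_f32x4_pmin/wasm_f32x4_pmax, whose NaN-propagation rules let engines
// lower them to a single minps/maxps on x86 (plain f32x4 min/max would need
// extra NaN handling there).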
const v128_t vsum01 = wasm_f32x4_add(wasm_v32x4_shuffle(vsum0, vsum1, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum0, vsum1, 1, 3, 5, 7)); const v128_t vsum23 = wasm_f32x4_add(wasm_v32x4_shuffle(vsum2, vsum3, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum2, vsum3, 1, 3, 5, 7)); const v128_t vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum01, vsum23, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum01, vsum23, 1, 3, 5, 7)); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmin(vmax, vout); vout = wasm_f32x4_pmax(vmin, vout); wasm_v128_store(output, vout); output += 4; i0 = i3; i1 = (const float*) ((uintptr_t) i0 + elements); i2 = (const float*) ((uintptr_t) i1 + elements); i3 = (const float*) ((uintptr_t) i2 + elements); channels -= 4; } while (channels != 0) { v128_t vsum = wasm_f32x4_const_splat(0.0f); size_t n = elements; while (n >= 4 * sizeof(float)) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; vsum = wasm_f32x4_add(vsum, vi0); n -= 4 * sizeof(float); } if XNN_UNLIKELY(n != 0) { v128_t vi0 = wasm_v128_and(vmask, wasm_v128_load(i0)); i0 = (const float*) ((uintptr_t) i0 + n); vsum = wasm_f32x4_add(vsum, vi0); } vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1, 3, 5, 7)); vsum = wasm_f32x4_add(wasm_v32x4_shuffle(vsum, vsum, 0, 2, 4, 6), wasm_v32x4_shuffle(vsum, vsum, 1, 3, 5, 7)); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmin(vmax, vout); vout = wasm_f32x4_pmax(vmin, vout); *output++ = wasm_f32x4_extract_lane(vout, 0); channels -= 1; } } void xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4( size_t rows, size_t channels, const float* input, size_t input_stride, const float* zero, float* buffer, float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); const size_t packed_channels = round_up_po2(channels, 4); const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); wasm_v128_store(b, vsum); b += 4; }
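// Middle passes: rewind b and accumulate 7 more rows into the buffer on each
// pass. input_increment steps the row pointers to the next group of 7 rows,
// compensating for the packed_channels floats they already advanced.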
for (rows -= 7; rows > 7; rows -= 7) { b = buffer; i0 = (const float*) ((uintptr_t) i0 + input_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); i2 = (const float*) ((uintptr_t) i2 + input_increment); i3 = (const float*) ((uintptr_t) i3 + input_increment); i4 = (const float*) ((uintptr_t) i4 + input_increment); i5 = (const float*) ((uintptr_t) i5 + input_increment); i6 = (const float*) ((uintptr_t) i6 + input_increment); for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); wasm_v128_store(b, vsum); b += 4; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); if (rows < 2) { i1 = zero; } i2 = (const float*) ((uintptr_t) i2 + input_increment); if (rows <= 2) { i2 = zero; } i3 = (const float*) ((uintptr_t) i3 + input_increment); if (rows < 4) { i3 = zero; } i4 = (const float*) ((uintptr_t) i4 + input_increment); if (rows <= 4) { i4 = zero; } i5 = (const float*) ((uintptr_t) i5 + input_increment); if (rows < 6) { i5 = zero; } i6 = (const float*) ((uintptr_t) i6 + input_increment); if (rows <= 6) { i6 = zero; } const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max); b = buffer; while (channels >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vacc = wasm_v128_load(b); b += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); wasm_v128_store(output, vout); output += 4; channels -= 4; } if (channels != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); if (channels & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if
(channels & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } } void xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4( size_t rows, size_t channels, const float* input, size_t input_stride, const float* zero, float* buffer, float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); const size_t packed_channels = round_up_po2(channels, 4); const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); wasm_v128_store(b, vsum); b += 4; } for (rows -= 7; rows > 7; rows -= 7) { b = buffer; i0 = (const float*) ((uintptr_t) i0 + input_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); i2 = (const float*) ((uintptr_t) i2 + input_increment); i3 = (const float*) ((uintptr_t) i3 + input_increment); i4 = (const float*) ((uintptr_t) i4 + input_increment); i5 = (const float*) ((uintptr_t) i5 + input_increment); i6 = (const float*) ((uintptr_t) i6 + input_increment); for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); wasm_v128_store(b, vsum); b += 4; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); if (rows < 2) { i1 = zero; } i2 = (const float*) ((uintptr_t) i2 + input_increment); if (rows <= 2) { i2 = zero; } i3 = (const float*) ((uintptr_t) i3 + input_increment); if (rows < 4) { i3 = zero; } i4 = (const float*) ((uintptr_t) i4 + input_increment); if (rows <= 4) { i4 = zero; } i5 = (const float*) ((uintptr_t) i5 + input_increment); if (rows < 6) { i5 = zero; } i6 = (const float*) ((uintptr_t) i6 + input_increment); if (rows <= 6) { i6 = zero; } const v128_t vscale = 
wasm_v128_load32_splat(&params->scalar.scale); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max); b = buffer; while (channels >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vacc = wasm_v128_load(b); b += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); wasm_v128_store(output, vout); output += 4; channels -= 4; } if (channels != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); if (channels & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (channels & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } }
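// Single-pass variants: for 1 <= rows <= 7, row pointers past the last valid
// row are redirected to the caller-provided zero buffer, so all seven row
// additions can run unconditionally.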
void xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4( size_t rows, size_t channels, const float* input, size_t input_stride, const float* zero, float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); if (rows < 2) { i1 = zero; } const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); if (rows <= 2) { i2 = zero; } const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); if (rows < 4) { i3 = zero; } const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); if (rows <= 4) { i4 = zero; } const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); if (rows < 6) { i5 = zero; } const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); if (rows <= 6) { i6 = zero; } const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max); while (channels >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); wasm_v128_store(output, vout); output += 4; channels -= 4; } if (channels != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); if (channels & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (channels & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } } void xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4( size_t rows, size_t channels, const float* input, size_t input_stride, const float* zero, float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const float* i0 = input; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); if (rows < 2) { i1 = zero; } const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); if (rows <= 2) { i2 = zero; } const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); if (rows < 4) { i3 = zero; } const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); if (rows <= 4) { i4 = zero; } const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); if (rows < 6) { i5 = zero; } const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); if (rows <= 6) { i6 = zero; } const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale); const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min); const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max); while (channels >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); wasm_v128_store(output, vout); output += 4; channels -= 4; } if (channels != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = 
wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); v128_t vout = wasm_f32x4_mul(vsum, vscale); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); if (channels & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (channels & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } } void xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - 
kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* 
restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 
0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { 
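// c4 layout: each vaccMxNc4 register carries 4 partial products along the K
// dimension for one (row M, output column N) pair; lanes are only reduced to
// a single dot product after the K loop.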
const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); a0 = (const float*) ((uintptr_t) a0 + k); const v128_t va1 = wasm_v128_load(a1); a1 = (const float*) ((uintptr_t) a1 + k); const v128_t va2 = wasm_v128_load(a2); a2 = (const float*) ((uintptr_t) a2 + k); const v128_t va3 = wasm_v128_load(a3); a3 = (const float*) ((uintptr_t) a3 + k); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); vacc01x01 = wasm_f32x4_max(vmin, vacc01x01); vacc23x01 = wasm_f32x4_max(vmin, vacc23x01); vacc01x01 = wasm_f32x4_min(vmax, vacc01x01); vacc23x01 = wasm_f32x4_min(vmax, vacc23x01); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); a2 = (const float*) ((uintptr_t) a2 - kc); wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 
+ cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); a1 = (const float*) ((uintptr_t) a1 - kc); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); a0 = (const float*) ((uintptr_t) a0 + k); const v128_t va1 = wasm_v128_load(a1); a1 = (const float*) ((uintptr_t) a1 + k); const v128_t va2 = wasm_v128_load(a2); a2 = (const float*) ((uintptr_t) a2 + k); const v128_t va3 = wasm_v128_load(a3); a3 = (const float*) ((uintptr_t) a3 + k); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), 
vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); vacc01x01 = wasm_f32x4_pmax(vmin, vacc01x01); vacc23x01 = wasm_f32x4_pmax(vmin, vacc23x01); vacc01x01 = wasm_f32x4_pmin(vmax, vacc01x01); vacc23x01 = wasm_f32x4_pmin(vmax, vacc23x01); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); a2 = (const float*) ((uintptr_t) a2 - kc); wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); a1 = (const float*) ((uintptr_t) a1 - kc); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); nc = 0; } } while (nc != 0); } void xnn_f32_gemm_ukernel_4x2c4__wasmsimd( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; 
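// wasm_v128_load32_zero puts the bias value in lane 0 and zeroes the other
// lanes, so all four rows start their c4 accumulators from the same two
// biases.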
v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); a0 = (const float*) ((uintptr_t) a0 + k); const v128_t va1 = wasm_v128_load(a1); a1 = (const float*) ((uintptr_t) a1 + k); const v128_t va2 = wasm_v128_load(a2); a2 = (const float*) ((uintptr_t) a2 + k); const v128_t va3 = wasm_v128_load(a3); a3 = (const float*) ((uintptr_t) a3 + k); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); a2 = (const float*) ((uintptr_t) a2 - kc); wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); 
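// Lanes of vacc01x01/vacc23x01 hold (row, column) pairs, so each row's two
// outputs are written with a single 64-bit lane store.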
wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); a1 = (const float*) ((uintptr_t) a1 - kc); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = 
wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_pmax(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_pmax(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_pmax(vmin, vacc3x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_pmax(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_pmax(vmin, 
vacc2x4567); vacc3x4567 = wasm_f32x4_pmax(vmin, vacc3x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_pmin(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_pmin(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_pmin(vmax, vacc3x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_pmin(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_pmin(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_pmin(vmax, vacc3x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const 
v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, 
vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), 
vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); 
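// Tail stores for nc < 8 (this block and the mirrored stores that follow):
// wasm_v128_store64_lane(ptr, v, 0) writes the two low f32 lanes of a row's
// accumulator, then wasm_v64x2_shuffle(v, v, 1, 1) rotates the upper half
// down so the (nc & 1) case can emit one last column with a 32-bit lane
// store. Scalar sketch of the same logic (illustrative only):
//
//   if (nc & 2) { c[0] = acc[0]; c[1] = acc[1]; acc[0] = acc[2]; acc[1] = acc[3]; c += 2; }
//   if (nc & 1) { c[0] = acc[0]; }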
wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = 
wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if 
XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_max(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_max(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_max(vmin, vacc3x0123); vacc4x0123 = wasm_f32x4_max(vmin, vacc4x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_max(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_max(vmin, vacc2x4567); vacc3x4567 = wasm_f32x4_max(vmin, vacc3x4567); vacc4x4567 = wasm_f32x4_max(vmin, vacc4x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_min(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_min(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_min(vmax, vacc3x0123); vacc4x0123 = wasm_f32x4_min(vmax, vacc4x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_min(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_min(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_min(vmax, vacc3x4567); vacc4x4567 = wasm_f32x4_min(vmax, vacc4x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); 
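// Note on the _arm_splat suffix: this variant clamps with wasm_f32x4_max and
// wasm_f32x4_min, whose IEEE NaN/signed-zero semantics lower to single
// instructions on ARM, while the _x86_splat variant earlier in this file uses
// wasm_f32x4_pmax/wasm_f32x4_pmin, e.g. pmax(vmin, x) == (vmin < x) ? x : vmin,
// which maps directly to SSE maxps. For the finite min/max bounds loaded from
// params, both forms produce the same clamped result.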
vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const 
v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 
+= 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc4x0123 = wasm_i32x4_max(vacc4x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); vacc4x4567 = wasm_i32x4_max(vacc4x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while 
(nc != 0); } void xnn_f32_gemm_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load(w + 0); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), 
vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); 
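// (k remainder, continued) kc need not be a multiple of 4 floats; leftover
// iterations are processed one k at a time. wasm_v128_load32_splat broadcasts
// a single element of each row of A across all four lanes, and that scalar is
// multiplied against the full 8-wide slice of packed weights (vb0123/vb4567),
// exactly like one step of the unrolled main loop.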
vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t input_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(input_increment % sizeof(float) == 0); do { const float** i = input; const float* w = weights; size_t p = output_pixels; for (; p >= 8; p -= 8) { const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); const float* itl4 = (const float*) ((uintptr_t) i[8] + 
input_offset); const float* ibl4 = (const float*) ((uintptr_t) i[9] + input_offset); const float* itl5 = (const float*) ((uintptr_t) i[10] + input_offset); const float* ibl5 = (const float*) ((uintptr_t) i[11] + input_offset); const float* itl6 = (const float*) ((uintptr_t) i[12] + input_offset); const float* ibl6 = (const float*) ((uintptr_t) i[13] + input_offset); const float* itl7 = (const float*) ((uintptr_t) i[14] + input_offset); const float* ibl7 = (const float*) ((uintptr_t) i[15] + input_offset); i += 2 * 8; const v128_t vw0123p0 = wasm_v128_load(w + 0); const v128_t vw0123p1 = wasm_v128_load(w + 4); const v128_t vw4567p0 = wasm_v128_load(w + 8); const v128_t vw4567p1 = wasm_v128_load(w + 12); w += 2 * 8; const v128_t vtltr0 = wasm_v128_load64_zero(itl0); const v128_t vblbr0 = wasm_v128_load64_zero(ibl0); const v128_t vtltr2 = wasm_v128_load64_zero(itl2); const v128_t vblbr2 = wasm_v128_load64_zero(ibl2); const v128_t vtltr4 = wasm_v128_load64_zero(itl4); const v128_t vblbr4 = wasm_v128_load64_zero(ibl4); const v128_t vtltr6 = wasm_v128_load64_zero(itl6); const v128_t vblbr6 = wasm_v128_load64_zero(ibl6); const v128_t vtltr01 = wasm_v128_load64_lane(itl1, vtltr0, 1); const v128_t vblbr01 = wasm_v128_load64_lane(ibl1, vblbr0, 1); const v128_t vtltr23 = wasm_v128_load64_lane(itl3, vtltr2, 1); const v128_t vblbr23 = wasm_v128_load64_lane(ibl3, vblbr2, 1); const v128_t vtltr45 = wasm_v128_load64_lane(itl5, vtltr4, 1); const v128_t vblbr45 = wasm_v128_load64_lane(ibl5, vblbr4, 1); const v128_t vtltr67 = wasm_v128_load64_lane(itl7, vtltr6, 1); const v128_t vblbr67 = wasm_v128_load64_lane(ibl7, vblbr6, 1); const v128_t valphah0123 = wasm_v32x4_shuffle(vw0123p0, vw0123p1, 0, 2, 4, 6); const v128_t valphav0123 = wasm_v32x4_shuffle(vw0123p0, vw0123p1, 1, 3, 5, 7); const v128_t valphah4567 = wasm_v32x4_shuffle(vw4567p0, vw4567p1, 0, 2, 4, 6); const v128_t valphav4567 = wasm_v32x4_shuffle(vw4567p0, vw4567p1, 1, 3, 5, 7); const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01); const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23); const v128_t vldrd45 = wasm_f32x4_sub(vblbr45, vtltr45); const v128_t vldrd67 = wasm_f32x4_sub(vblbr67, vtltr67); const v128_t vld0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6); const v128_t vrd0123 = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7); const v128_t vld4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 0, 2, 4, 6); const v128_t vrd4567 = wasm_v32x4_shuffle(vldrd45, vldrd67, 1, 3, 5, 7); const v128_t vtl0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6); const v128_t vtr0123 = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7); const v128_t vtl4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 0, 2, 4, 6); const v128_t vtr4567 = wasm_v32x4_shuffle(vtltr45, vtltr67, 1, 3, 5, 7); const v128_t vl0123 = wasm_f32x4_add(vtl0123, wasm_f32x4_mul(vld0123, valphav0123)); const v128_t vr0123 = wasm_f32x4_add(vtr0123, wasm_f32x4_mul(vrd0123, valphav0123)); const v128_t vl4567 = wasm_f32x4_add(vtl4567, wasm_f32x4_mul(vld4567, valphav4567)); const v128_t vr4567 = wasm_f32x4_add(vtr4567, wasm_f32x4_mul(vrd4567, valphav4567)); const v128_t vd0123 = wasm_f32x4_sub(vr0123, vl0123); const v128_t vd4567 = wasm_f32x4_sub(vr4567, vl4567); const v128_t vo0123 = wasm_f32x4_add(vl0123, wasm_f32x4_mul(vd0123, valphah0123)); const v128_t vo4567 = wasm_f32x4_add(vl4567, wasm_f32x4_mul(vd4567, valphah4567)); wasm_v128_store(output + 0, vo0123); wasm_v128_store(output + 4, vo4567); output += 8; } for (; p >= 4; p -= 4) { const float* itl0 = (const float*) ((uintptr_t) i[0] + 
input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); const float* itl2 = (const float*) ((uintptr_t) i[4] + input_offset); const float* ibl2 = (const float*) ((uintptr_t) i[5] + input_offset); const float* itl3 = (const float*) ((uintptr_t) i[6] + input_offset); const float* ibl3 = (const float*) ((uintptr_t) i[7] + input_offset); i += 4; const v128_t vw0 = wasm_v128_load(w); const v128_t vw1 = wasm_v128_load(w + 4); w += 8; const v128_t vtltr0 = wasm_v128_load64_splat(itl0); const v128_t vblbr0 = wasm_v128_load64_splat(ibl0); const v128_t vtltr2 = wasm_v128_load64_splat(itl2); const v128_t vblbr2 = wasm_v128_load64_splat(ibl2); const v128_t vtltr01 = wasm_v128_load64_lane(itl1, vtltr0, 1); const v128_t vblbr01 = wasm_v128_load64_lane(ibl1, vblbr0, 1); const v128_t vtltr23 = wasm_v128_load64_lane(itl3, vtltr2, 1); const v128_t vblbr23 = wasm_v128_load64_lane(ibl3, vblbr2, 1); const v128_t valphah = wasm_v32x4_shuffle(vw0, vw1, 0, 2, 4, 6); const v128_t valphav = wasm_v32x4_shuffle(vw0, vw1, 1, 3, 5, 7); const v128_t vldrd01 = wasm_f32x4_sub(vblbr01, vtltr01); const v128_t vldrd23 = wasm_f32x4_sub(vblbr23, vtltr23); const v128_t vld = wasm_v32x4_shuffle(vldrd01, vldrd23, 0, 2, 4, 6); const v128_t vrd = wasm_v32x4_shuffle(vldrd01, vldrd23, 1, 3, 5, 7); const v128_t vtl = wasm_v32x4_shuffle(vtltr01, vtltr23, 0, 2, 4, 6); const v128_t vtr = wasm_v32x4_shuffle(vtltr01, vtltr23, 1, 3, 5, 7); const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav)); const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav)); const v128_t vd = wasm_f32x4_sub(vr, vl); const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); wasm_v128_store(output, vo); output += 4; } if XNN_UNLIKELY(p != 0) { if (p & 2) { const v128_t vw = wasm_v128_load(w); w += 4; const v128_t valphah = wasm_v32x4_shuffle(vw, vw, 0, 2, 0, 2); const v128_t valphav = wasm_v32x4_shuffle(vw, vw, 1, 3, 1, 3); const float* itl0 = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl0 = (const float*) ((uintptr_t) i[1] + input_offset); const float* itl1 = (const float*) ((uintptr_t) i[2] + input_offset); const float* ibl1 = (const float*) ((uintptr_t) i[3] + input_offset); i += 4; const v128_t vtltr = wasm_v128_load64_lane(itl1, wasm_v128_load64_zero(itl0), 1); const v128_t vblbr = wasm_v128_load64_lane(ibl1, wasm_v128_load64_zero(ibl0), 1); const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr); const v128_t vld = wasm_v32x4_shuffle(vldrd, vldrd, 0, 2, 0, 2); const v128_t vrd = wasm_v32x4_shuffle(vldrd, vldrd, 1, 3, 1, 3); const v128_t vtl = wasm_v32x4_shuffle(vtltr, vtltr, 0, 2, 0, 2); const v128_t vtr = wasm_v32x4_shuffle(vtltr, vtltr, 1, 3, 1, 3); const v128_t vl = wasm_f32x4_add(vtl, wasm_f32x4_mul(vld, valphav)); const v128_t vr = wasm_f32x4_add(vtr, wasm_f32x4_mul(vrd, valphav)); const v128_t vd = wasm_f32x4_sub(vr, vl); const v128_t vo = wasm_f32x4_add(vl, wasm_f32x4_mul(vd, valphah)); wasm_v128_store64_lane(output, vo, 0); output += 2; } if (p & 1) {
// We are computing the following formula:
//   result = (1 - alpha_h) * (1 - alpha_v) * top_left +
//            alpha_h * (1 - alpha_v) * top_right +
//            (1 - alpha_h) * alpha_v * bottom_left +
//            alpha_h * alpha_v * bottom_right.
//
// Rearranging gives
//   result = left + alpha_h * (right - left),
// where
//   left = top_left + alpha_v * (bottom_left - top_left),
//   right = top_right + alpha_v * (bottom_right - top_right).
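//
// A scalar sketch of the same per-pixel computation (illustrative only; the
// vector code below evaluates left and right together in one f32x4
// subtract/multiply/add sequence):
//
//   const float left   = top_left  + alpha_v * (bottom_left  - top_left);
//   const float right  = top_right + alpha_v * (bottom_right - top_right);
//   const float result = left + alpha_h * (right - left);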
const float alphah = *w; const v128_t valphav = wasm_v128_load32_splat(w + 1); w += 2; const float* itl = (const float*) ((uintptr_t) i[0] + input_offset); const float* ibl = (const float*) ((uintptr_t) i[1] + input_offset); i += 2; const v128_t vtltr = wasm_v128_load64_zero(itl); const v128_t vblbr = wasm_v128_load64_zero(ibl);
// Compute at once
//    left_diff = bottom_left - top_left
//   right_diff = bottom_right - top_right
const v128_t vldrd = wasm_f32x4_sub(vblbr, vtltr); const v128_t vlr = wasm_f32x4_add(vtltr, wasm_f32x4_mul(vldrd, valphav));
// Extract them and compute the result.
const float l = wasm_f32x4_extract_lane(vlr, 0); const float r = wasm_f32x4_extract_lane(vlr, 1); *output++ = l + alphah * (r - l); } } input_offset += input_increment; } while (--channels != 0); } void xnn_f32_ibilinear_ukernel__wasmsimd_c8( size_t output_pixels, size_t channels, const float** restrict input, size_t input_offset, const float* restrict weights, float* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); do { const float* i0 = (const float*) ((uintptr_t) input[0] + input_offset); const float* i1 = (const float*) ((uintptr_t) input[1] + input_offset); const float* i2 = (const float*) ((uintptr_t) input[2] + input_offset); const float* i3 = (const float*) ((uintptr_t) input[3] + input_offset); input += 4; const v128_t valphah = wasm_v128_load32_splat(weights); const v128_t valphav = wasm_v128_load32_splat(weights + 1); weights += 2; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const v128_t vtl0123 = wasm_v128_load(i0); const v128_t vtr0123 = wasm_v128_load(i1); const v128_t vbl0123 = wasm_v128_load(i2); const v128_t vbr0123 = wasm_v128_load(i3); const v128_t vtl4567 = wasm_v128_load(i0 + 4); const v128_t vtr4567 = wasm_v128_load(i1 + 4); const v128_t vbl4567 = wasm_v128_load(i2 + 4); const v128_t vbr4567 = wasm_v128_load(i3 + 4); i0 += 8; i1 += 8; i2 += 8; i3 += 8; const v128_t vtd0123 = wasm_f32x4_sub(vtr0123, vtl0123); const v128_t vbd0123 = wasm_f32x4_sub(vbr0123, vbl0123); const v128_t vtd4567 = wasm_f32x4_sub(vtr4567, vtl4567); const v128_t vbd4567 = wasm_f32x4_sub(vbr4567, vbl4567); const v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vtd0123, valphah), vtl0123); const v128_t vb0123 = wasm_f32x4_add(wasm_f32x4_mul(vbd0123, valphah), vbl0123); const v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vtd4567, valphah), vtl4567); const v128_t vb4567 = wasm_f32x4_add(wasm_f32x4_mul(vbd4567, valphah), vbl4567); const v128_t vd0123 = wasm_f32x4_sub(vb0123, vt0123); const v128_t vd4567 = wasm_f32x4_sub(vb4567, vt4567); const v128_t vo0123 = wasm_f32x4_add(wasm_f32x4_mul(vd0123, valphav), vt0123); const v128_t vo4567 = wasm_f32x4_add(wasm_f32x4_mul(vd4567, valphav), vt4567); wasm_v128_store(output, vo0123); wasm_v128_store(output + 4, vo4567); output += 8; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const v128_t vtl = wasm_v128_load(i0); const v128_t vtr = wasm_v128_load(i1); const v128_t vbl = wasm_v128_load(i2); const v128_t vbr = wasm_v128_load(i3); i0 += 4; i1 += 4; i2 += 4; i3 += 4; const v128_t vtd = wasm_f32x4_sub(vtr, vtl); const v128_t vbd = wasm_f32x4_sub(vbr, vbl); const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vtd, valphah), vtl); const v128_t vb = 
          wasm_f32x4_add(wasm_f32x4_mul(vbd, valphah), vbl);

      const v128_t vd = wasm_f32x4_sub(vb, vt);
      const v128_t vo = wasm_f32x4_add(wasm_f32x4_mul(vd, valphav), vt);

      wasm_v128_store(output, vo);
      output += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      const v128_t vtl = wasm_v128_load(i0);
      const v128_t vtr = wasm_v128_load(i1);
      const v128_t vbl = wasm_v128_load(i2);
      const v128_t vbr = wasm_v128_load(i3);

      const v128_t vtd = wasm_f32x4_sub(vtr, vtl);
      const v128_t vbd = wasm_f32x4_sub(vbr, vbl);

      const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vtd, valphah), vtl);
      const v128_t vb = wasm_f32x4_add(wasm_f32x4_mul(vbd, valphah), vbl);

      const v128_t vd = wasm_f32x4_sub(vb, vt);
      v128_t vo = wasm_f32x4_add(wasm_f32x4_mul(vd, valphav), vt);

      if (c & (2 * sizeof(float))) {
        wasm_v128_store64_lane(output, vo, 0);
        vo = wasm_v64x2_shuffle(vo, vo, 1, 1);
        output += 2;
      }
      if (c & (1 * sizeof(float))) {
        wasm_v128_store32_lane(output, vo, 0);
        output += 1;
      }
    }

    output = (float*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}

// Indirect GEMM microkernel (1 row x 8 columns): `a` is an indirection buffer
// of input-row pointers, `w` holds the bias followed by packed 8-column panels
// of B, and the `splat` variant broadcasts each lane of a 4-wide activation
// load in turn. The `arm` suffix marks the clamping flavor (wasm_f32x4_min/max
// rather than the pmin/pmax used by the `x86` variant below).
void xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const float** restrict a,
    const float* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(float) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  float* c0 = c;

  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
  do {
    v128_t vacc0x0123 = wasm_v128_load(w);
    v128_t vacc0x4567 = wasm_v128_load(w + 4);
    w += 8;

    size_t p = ks;
    do {
      const float* restrict a0 = a[0];
      assert(a0 != NULL);
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const float*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      while (k >= 4 * sizeof(float)) {
        const v128_t va0 = wasm_v128_load(a0);
        a0 += 4;

        const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0);

        const v128_t vb0123c0 = wasm_v128_load(w + 0);
        const v128_t vb4567c0 = wasm_v128_load(w + 4);

        vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123);
        vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567);

        const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1);

        const v128_t vb0123c1 = wasm_v128_load(w + 8);
        const v128_t vb4567c1 = wasm_v128_load(w + 12);

        vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123);
        vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567);

        const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2);

        const v128_t vb0123c2 = wasm_v128_load(w + 16);
        const v128_t vb4567c2 = wasm_v128_load(w + 20);

        vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123);
        vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567);

        const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3);

        const v128_t vb0123c3 = wasm_v128_load(w + 24);
        const v128_t vb4567c3 = wasm_v128_load(w + 28);

        vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123);
        vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567);

        w += 32;
        k -= 4 * sizeof(float);
      }
      if XNN_UNLIKELY(k != 0) {
        do {
          const v128_t vb0123 = wasm_v128_load(w);
          const v128_t vb4567 = wasm_v128_load(w + 4);
          w += 8;

          const v128_t va0 = wasm_v128_load32_splat(a0);
          a0 += 1;

          vacc0x0123 =
wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } p -= 1 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), 
vacc0x4567); k -= sizeof(float); } while (k != 0); } p -= 1 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } p -= 1 * sizeof(void*); } while (p != 0); const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); if 
XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } p -= 1 * sizeof(void*); } while (p != 0); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { 
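      // nc tail: the (nc & 4) and (nc & 2) branches above stored four- and
      // two-element chunks and shifted the surviving accumulator lanes down;
      // the final odd element of the output row is stored here.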
wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); const v128_t va1 = wasm_v128_load(a1); const v128_t va2 = wasm_v128_load(a2); const v128_t va3 = wasm_v128_load(a3); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), vacc1x0c4); vacc1x1c4 = 
wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } p -= 4 * sizeof(void*); } while (p != 0); const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); vacc01x01 = wasm_f32x4_max(vmin, vacc01x01); vacc23x01 = wasm_f32x4_max(vmin, vacc23x01); vacc01x01 = wasm_f32x4_min(vmax, vacc01x01); vacc23x01 = wasm_f32x4_min(vmax, vacc23x01); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t p = ks; do { const float* 
restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); const v128_t va1 = wasm_v128_load(a1); const v128_t va2 = wasm_v128_load(a2); const v128_t va3 = wasm_v128_load(a3); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } p -= 4 * sizeof(void*); } while (p != 0); const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); vacc01x01 = wasm_f32x4_pmax(vmin, 
vacc01x01); vacc23x01 = wasm_f32x4_pmax(vmin, vacc23x01); vacc01x01 = wasm_f32x4_pmin(vmax, vacc01x01); vacc23x01 = wasm_f32x4_pmin(vmax, vacc23x01); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); nc = 0; } } while (nc != 0); } void xnn_f32_igemm_ukernel_4x2c4__wasmsimd( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { v128_t vacc0x0c4 = wasm_v128_load32_zero(w); v128_t vacc0x1c4 = wasm_v128_load32_zero(w + 1); v128_t vacc1x0c4 = vacc0x0c4; v128_t vacc1x1c4 = vacc0x1c4; v128_t vacc2x0c4 = vacc0x0c4; v128_t vacc2x1c4 = vacc0x1c4; v128_t vacc3x0c4 = vacc0x0c4; v128_t vacc3x1c4 = vacc0x1c4; w += 2; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb1), vacc3x1c4); } if XNN_UNLIKELY(k != 0) { const v128_t va0 = wasm_v128_load(a0); const v128_t va1 = wasm_v128_load(a1); const v128_t va2 = 
wasm_v128_load(a2); const v128_t va3 = wasm_v128_load(a3); const v128_t vb0 = wasm_v128_load(w); const v128_t vb1 = wasm_v128_load(w + 4); w += 8; const v128_t vzero = wasm_f32x4_const_splat(0.0f); const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero); const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero); vacc0x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0), vacc0x0c4); vacc0x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1), vacc0x1c4); vacc1x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0), vacc1x0c4); vacc1x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1), vacc1x1c4); vacc2x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0), vacc2x0c4); vacc2x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1), vacc2x1c4); vacc3x0c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0), vacc3x0c4); vacc3x1c4 = wasm_f32x4_add(wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1), vacc3x1c4); } p -= 4 * sizeof(void*); } while (p != 0); const v128_t vacc0x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7)); const v128_t vacc1x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7)); const v128_t vacc2x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7)); const v128_t vacc3x01c2 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7)); v128_t vacc01x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7)); v128_t vacc23x01 = wasm_f32x4_add( wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5), wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7)); if XNN_LIKELY(nc >= 2) { wasm_v128_store64_lane(c3, vacc23x01, 1); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store64_lane(c2, vacc23x01, 0); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store64_lane(c1, vacc01x01, 1); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store64_lane(c0, vacc01x01, 0); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 2; } else { assert(nc == 1); wasm_v128_store32_lane(c3, vacc23x01, 2); wasm_v128_store32_lane(c2, vacc23x01, 0); wasm_v128_store32_lane(c1, vacc01x01, 2); wasm_v128_store32_lane(c0, vacc01x01, 0); nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); 
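  // Assumption based on the 64-bit splat loads here: in the wasmsimd minmax
  // params layout, min and max are each stored as a two-element array holding
  // the same scalar twice, so wasm_v128_load64_splat fills all four f32 lanes
  // with the clamp bound.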
const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); 
vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } p -= 4 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_pmax(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_pmax(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_pmax(vmin, vacc3x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_pmax(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_pmax(vmin, vacc2x4567); vacc3x4567 = wasm_f32x4_pmax(vmin, vacc3x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_pmin(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_pmin(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_pmin(vmax, vacc3x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_pmin(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_pmin(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_pmin(vmax, vacc3x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); 
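      // Output rows are written from c3 down to c0; each row pointer then
      // advances by cn_stride to the next 8-column tile of the output.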
wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); 
vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; 
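          // k remainder (kc not a multiple of 4): the next activation scalar
          // of each row is broadcast across the vector, like va0 above, and
          // multiplied against the full 8-wide panel of B for this k step.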
const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } p -= 4 * sizeof(void*); } while (p != 0); const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) 
((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), 
vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } p -= 4 * sizeof(void*); } while (p != 0); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, 
vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (5 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { c3 = c2; } float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { c4 = c3; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } const float* restrict a4 = a[4]; assert(a4 != NULL); if XNN_UNPREDICTABLE(a4 != zero) { a4 = (const float*) ((uintptr_t) a4 + a_offset); } a += 5; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), 
vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = 
wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } p -= 5 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_max(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_max(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_max(vmin, vacc3x0123); vacc4x0123 = wasm_f32x4_max(vmin, vacc4x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_max(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_max(vmin, vacc2x4567); vacc3x4567 = wasm_f32x4_max(vmin, vacc3x4567); vacc4x4567 = wasm_f32x4_max(vmin, vacc4x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_min(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_min(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_min(vmax, vacc3x0123); vacc4x0123 = wasm_f32x4_min(vmax, vacc4x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_min(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_min(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_min(vmax, vacc3x4567); vacc4x4567 = wasm_f32x4_min(vmax, vacc4x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { 
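/* Partial-tile store, 2 columns: write the low 64 bits of each row's
   accumulator, then rotate the high half down so a possible (nc & 1)
   column can still be stored from lane 0. */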
wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (5 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { c3 = c2; } float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } const float* restrict a4 = a[4]; assert(a4 != NULL); if XNN_UNPREDICTABLE(a4 != zero) { a4 = (const float*) ((uintptr_t) a4 + a_offset); } a += 5; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = 
wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); 
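/* Rows 1-4 of the c3 broadcast step follow; each fully unrolled K block
   consumes 32 packed weights (4 K steps x 8 output columns). Note that once
   the K and KS loops drain, this kernel applies ReLU with wasm_i32x4_max
   against zero: for IEEE floats, signed-integer max with 0 zeroes every
   negative value (the sign bit makes it read as a negative integer) and
   passes non-negative values through unchanged. */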
vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } p -= 5 * sizeof(void*); } while (p != 0); const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc4x0123 = wasm_i32x4_max(vacc4x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); vacc4x4567 = wasm_i32x4_max(vacc4x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); 
wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_igemm_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, size_t ks, const float** restrict a, const float* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const float* zero, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(ks != 0); assert(ks % (5 * sizeof(void*)) == 0); assert(a_offset % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); float* c0 = c; float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { c3 = c2; } float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc0x4567 = wasm_v128_load(w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w += 8; size_t p = ks; do { const float* restrict a0 = a[0]; assert(a0 != NULL); if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const float*) ((uintptr_t) a0 + a_offset); } const float* restrict a1 = a[1]; assert(a1 != NULL); if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const float*) ((uintptr_t) a1 + a_offset); } const float* restrict a2 = a[2]; assert(a2 != NULL); if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const float*) ((uintptr_t) a2 + a_offset); } const float* restrict a3 = a[3]; assert(a3 != NULL); if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const float*) ((uintptr_t) a3 + a_offset); } const float* restrict a4 = a[4]; assert(a4 != NULL); if XNN_UNPREDICTABLE(a4 != zero) { a4 = (const float*) ((uintptr_t) a4 + a_offset); } a += 5; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_v128_load(w + 0); const v128_t vb4567c0 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); 
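/* Splat scheme: each of the 4 A lanes is broadcast in turn and multiplied
   against the 8 packed B columns, accumulating a 5x8 output tile in
   vaccRx0123/vaccRx4567. This variant applies no activation clamp before
   the stores. */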
vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_v128_load(w + 8); const v128_t vb4567c1 = wasm_v128_load(w + 12); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_v128_load(w + 16); const v128_t vb4567c2 = wasm_v128_load(w + 20); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_v128_load(w + 24); const v128_t vb4567c3 = wasm_v128_load(w + 28); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = 
wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w += 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t vb0123 = wasm_v128_load(w); const v128_t vb4567 = wasm_v128_load(w + 4); w += 8; const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } p -= 5 * sizeof(void*); } while (p != 0); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a = (const float**restrict) ((uintptr_t) a - ks); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); 
wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); do { float* o = output; { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; const float* i8 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); i8 = (const float*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 4; c -= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vmax018 = wasm_f32x4_max(wasm_f32x4_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_f32x4_max(vi2, vi3); const v128_t vmax45 = wasm_f32x4_max(vi4, vi5); const v128_t vmax67 = wasm_f32x4_max(vi6, vi7); const v128_t vmax2345 = wasm_f32x4_max(vmax23, vmax45); const v128_t vmax01678 = wasm_f32x4_max(vmax018, vmax67); const v128_t vmax = wasm_f32x4_max(vmax2345, vmax01678); const v128_t vout = wasm_f32x4_max(wasm_f32x4_min(vmax, voutput_max), voutput_min); wasm_v128_store(o, vout); o += 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vmax018 = wasm_f32x4_max(wasm_f32x4_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_f32x4_max(vi2, vi3); const v128_t vmax45 = wasm_f32x4_max(vi4, vi5); const v128_t vmax67 = wasm_f32x4_max(vi6, vi7); const v128_t vmax2345 = wasm_f32x4_max(vmax23, vmax45); const v128_t vmax01678 = wasm_f32x4_max(vmax018, vmax67); const v128_t vmax = wasm_f32x4_max(vmax2345, vmax01678); v128_t vout = 
wasm_f32x4_max(wasm_f32x4_min(vmax, voutput_max), voutput_min); if (c & 2) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 2; } if (c & 1) { wasm_v128_store32_lane(o, vout, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 4; c -= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_f32x4_max(wasm_f32x4_max(vi0, vi1), vo); const v128_t vmax23 = wasm_f32x4_max(vi2, vi3); const v128_t vmax45 = wasm_f32x4_max(vi4, vi5); const v128_t vmax67 = wasm_f32x4_max(vi6, vi7); const v128_t vmax2345 = wasm_f32x4_max(vmax23, vmax45); const v128_t vmax0167 = wasm_f32x4_max(vmax01, vmax67); const v128_t vmax = wasm_f32x4_max(vmax2345, vmax0167); const v128_t vout = wasm_f32x4_max(wasm_f32x4_min(vmax, voutput_max), voutput_min); wasm_v128_store(o, vout); o += 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_f32x4_max(wasm_f32x4_max(vi0, vi1), vo); const v128_t vmax23 = wasm_f32x4_max(vi2, vi3); const v128_t vmax45 = wasm_f32x4_max(vi4, vi5); const v128_t vmax67 = wasm_f32x4_max(vi6, vi7); const v128_t vmax2345 = wasm_f32x4_max(vmax23, vmax45); const v128_t vmax0167 = wasm_f32x4_max(vmax01, vmax67); const v128_t vmax = wasm_f32x4_max(vmax2345, vmax0167); v128_t vout = wasm_f32x4_max(wasm_f32x4_min(vmax, voutput_max), voutput_min); if (c & 2) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 2; } if (c & 1) { wasm_v128_store32_lane(o, vout, 0); o += 1; } } } input = (const float**) ((uintptr_t) input + input_increment); output = (float*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { 
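/* x86-tuned twin of the kernel above: wasm_f32x4_pmax/pmin ("pseudo"
   min/max with simple, non-NaN-propagating semantics) lower to single
   MAXPS/MINPS instructions on x86, whereas the ARM-tuned variant's
   wasm_f32x4_max/min carry NaN- and signed-zero-propagating semantics that
   are cheap on ARM but need extra instructions on x86. The structure is
   identical: a first pass reduces up to 9 taps, and each subsequent pass
   folds 8 more taps into the running maximum kept in the output row. */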
assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); do { float* o = output; { const float* i0 = *input++; const float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; const float* i8 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); i8 = (const float*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 4; c -= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vmax01 = wasm_f32x4_pmax(vi1, vi0); const v128_t vmax23 = wasm_f32x4_pmax(vi3, vi2); const v128_t vmax45 = wasm_f32x4_pmax(vi5, vi4); const v128_t vmax018 = wasm_f32x4_pmax(vi8, vmax01); const v128_t vmax67 = wasm_f32x4_pmax(vi7, vi6); const v128_t vmax2345 = wasm_f32x4_pmax(vmax45, vmax23); const v128_t vmax01678 = wasm_f32x4_pmax(vmax67, vmax018); const v128_t vmax = wasm_f32x4_pmax(vmax2345, vmax01678); v128_t vout = wasm_f32x4_pmax(voutput_min, vmax); vout = wasm_f32x4_pmin(voutput_max, vout); wasm_v128_store(o, vout); o += 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vmax01 = wasm_f32x4_pmax(vi1, vi0); const v128_t vmax23 = wasm_f32x4_pmax(vi3, vi2); const v128_t vmax45 = wasm_f32x4_pmax(vi5, vi4); const v128_t vmax018 = wasm_f32x4_pmax(vi8, vmax01); const v128_t vmax67 = wasm_f32x4_pmax(vi7, vi6); const v128_t vmax2345 = wasm_f32x4_pmax(vmax45, vmax23); const v128_t vmax01678 = wasm_f32x4_pmax(vmax67, vmax018); const v128_t vmax = wasm_f32x4_pmax(vmax2345, vmax01678); v128_t vout = wasm_f32x4_pmax(voutput_min, vmax); vout = wasm_f32x4_pmin(voutput_max, vout); if (c & 2) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 2; } if (c & 1) { wasm_v128_store32_lane(o, vout, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const float* i0 = *input++; const 
float* i1 = *input++; const float* i2 = *input++; const float* i3 = *input++; const float* i4 = *input++; const float* i5 = *input++; const float* i6 = *input++; const float* i7 = *input++; i0 = (const float*) ((uintptr_t) i0 + input_offset); i1 = (const float*) ((uintptr_t) i1 + input_offset); i2 = (const float*) ((uintptr_t) i2 + input_offset); i3 = (const float*) ((uintptr_t) i3 + input_offset); i4 = (const float*) ((uintptr_t) i4 + input_offset); i5 = (const float*) ((uintptr_t) i5 + input_offset); i6 = (const float*) ((uintptr_t) i6 + input_offset); i7 = (const float*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 4; c -= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_f32x4_pmax(vi1, vi0); const v128_t vmax23 = wasm_f32x4_pmax(vi3, vi2); const v128_t vmax45 = wasm_f32x4_pmax(vi5, vi4); const v128_t vmax01o = wasm_f32x4_pmax(vo, vmax01); const v128_t vmax67 = wasm_f32x4_pmax(vi7, vi6); const v128_t vmax2345 = wasm_f32x4_pmax(vmax45, vmax23); const v128_t vmax0167 = wasm_f32x4_pmax(vmax67, vmax01o); const v128_t vmax = wasm_f32x4_pmax(vmax2345, vmax0167); v128_t vout = wasm_f32x4_pmax(voutput_min, vmax); vout = wasm_f32x4_pmin(voutput_max, vout); wasm_v128_store(o, vout); o += 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_f32x4_pmax(vi1, vi0); const v128_t vmax23 = wasm_f32x4_pmax(vi3, vi2); const v128_t vmax45 = wasm_f32x4_pmax(vi5, vi4); const v128_t vmax01o = wasm_f32x4_pmax(vo, vmax01); const v128_t vmax67 = wasm_f32x4_pmax(vi7, vi6); const v128_t vmax2345 = wasm_f32x4_pmax(vmax45, vmax23); const v128_t vmax0167 = wasm_f32x4_pmax(vmax67, vmax01o); const v128_t vmax = wasm_f32x4_pmax(vmax2345, vmax0167); v128_t vout = wasm_f32x4_pmax(voutput_min, vmax); vout = wasm_f32x4_pmin(voutput_max, vout); if (c & 2) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 2; } if (c & 1) { wasm_v128_store32_lane(o, vout, 0); o += 1; } } } input = (const float**) ((uintptr_t) input + input_increment); output = (float*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* buffer, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements > 9); assert(channels != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = 
wasm_v128_load64_splat(params->wasmsimd.max); do { { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = *input++; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); wasm_v128_store(b, vsum); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); 
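/* Accumulation pass: 8 new taps per iteration are summed on top of the
   partial sums held in the caller-provided scratch buffer (pointer 'b'). */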
i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); wasm_v128_store(b, vsum); b += 4; } } { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; input = (const float**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const v128_t vmultiplier = wasm_v128_load32_splat(multiplier); multiplier += 1; size_t c = channels; float* b = buffer; while (c >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vacc = wasm_v128_load(b); b += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); wasm_v128_store(output, vout); output += 4; c -= 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const 
v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); if (c & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* buffer, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements > 9); assert(channels != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const float* i8 = *input++; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8); const v128_t vsum2345 = wasm_f32x4_add(vsum23, 
vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); wasm_v128_store(b, vsum); b += 4; } } size_t k = kernel_elements; for (k -= 9; k > 8; k -= 8) { const float* i0 = *input++; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } const float* i1 = *input++; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } const float* i2 = *input++; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } const float* i3 = *input++; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } const float* i4 = *input++; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } const float* i5 = *input++; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } const float* i6 = *input++; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } const float* i7 = *input++; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } float* b = buffer; for (size_t c = 0; c < channels; c += 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); wasm_v128_store(b, vsum); b += 4; } } { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; input = (const float**) ((uintptr_t) input + input_increment); if (k < 2) { i1 = zero; } assert(i1 != NULL); if (k <= 2) { i2 = zero; } assert(i2 != NULL); if (k < 4) { i3 = zero; } assert(i3 != NULL); if (k <= 4) { i4 = zero; } assert(i4 != NULL); if (k < 6) { i5 = zero; } assert(i5 != NULL); if (k <= 6) { i6 = zero; } assert(i6 != NULL); if (k < 8) { i7 = zero; } assert(i7 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != 
zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } const v128_t vmultiplier = wasm_v128_load32_splat(multiplier); multiplier += 1; size_t c = channels; float* b = buffer; while (c >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vacc = wasm_v128_load(b); b += 4; const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); wasm_v128_store(output, vout); output += 4; c -= 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vacc = wasm_v128_load(b); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum01a = wasm_f32x4_add(vsum01, vacc); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum0167a = wasm_f32x4_add(vsum01a, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum0167a); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); if (c & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, const float* multiplier, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(kernel_elements <= 9); assert(channels != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; const float* i8 = input[8]; input = (const float**) ((uintptr_t) input + input_increment); if (kernel_elements < 2) { i1 = zero; } assert(i1 != NULL); if (kernel_elements <= 2) { i2 = zero; } assert(i2 != NULL); if (kernel_elements < 4) { i3 = zero; } assert(i3 != NULL); if (kernel_elements <= 4) { 
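/* Taps beyond kernel_elements alias the caller's all-zero row, so all nine
   vector loads stay valid and the aliased taps contribute nothing to the
   sum. */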
i4 = zero; } assert(i4 != NULL); if (kernel_elements < 6) { i5 = zero; } assert(i5 != NULL); if (kernel_elements <= 6) { i6 = zero; } assert(i6 != NULL); if (kernel_elements < 8) { i7 = zero; } assert(i7 != NULL); if (kernel_elements <= 8) { i8 = zero; } assert(i8 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const v128_t vmultiplier = wasm_v128_load32_splat(multiplier); multiplier += 1; size_t c = channels; while (c >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vsum018 = wasm_f32x4_add(wasm_f32x4_add(vi0, vi1), vi8); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); wasm_v128_store(output, vout); output += 4; c -= 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vi8 = wasm_v128_load(i8); const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_max(vout, vmin); vout = wasm_f32x4_min(vout, vmax); if (c & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4( size_t output_pixels, size_t kernel_elements, size_t channels, const float** input, size_t input_offset, const float* zero, 
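// multiplier holds one scale value per output pixel (typically the reciprocal of the valid
// pooling-window size); it is reloaded and advanced once per pixel in the body below.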
const float* multiplier, float* output, size_t input_increment, size_t output_increment, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(kernel_elements <= 9); assert(channels != 0); const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { const float* i0 = input[0]; assert(i0 != NULL); const float* i1 = input[1]; const float* i2 = input[2]; const float* i3 = input[3]; const float* i4 = input[4]; const float* i5 = input[5]; const float* i6 = input[6]; const float* i7 = input[7]; const float* i8 = input[8]; input = (const float**) ((uintptr_t) input + input_increment); if (kernel_elements < 2) { i1 = zero; } assert(i1 != NULL); if (kernel_elements <= 2) { i2 = zero; } assert(i2 != NULL); if (kernel_elements < 4) { i3 = zero; } assert(i3 != NULL); if (kernel_elements <= 4) { i4 = zero; } assert(i4 != NULL); if (kernel_elements < 6) { i5 = zero; } assert(i5 != NULL); if (kernel_elements <= 6) { i6 = zero; } assert(i6 != NULL); if (kernel_elements < 8) { i7 = zero; } assert(i7 != NULL); if (kernel_elements <= 8) { i8 = zero; } assert(i8 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const float*) ((uintptr_t) i0 + input_offset); } if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const float*) ((uintptr_t) i1 + input_offset); } if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const float*) ((uintptr_t) i2 + input_offset); } if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const float*) ((uintptr_t) i3 + input_offset); } if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const float*) ((uintptr_t) i4 + input_offset); } if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const float*) ((uintptr_t) i5 + input_offset); } if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const float*) ((uintptr_t) i6 + input_offset); } if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const float*) ((uintptr_t) i7 + input_offset); } if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const float*) ((uintptr_t) i8 + input_offset); } const v128_t vmultiplier = wasm_v128_load32_splat(multiplier); multiplier += 1; size_t c = channels; while (c >= 4) { const v128_t vi0 = wasm_v128_load(i0); i0 += 4; const v128_t vi1 = wasm_v128_load(i1); i1 += 4; const v128_t vi2 = wasm_v128_load(i2); i2 += 4; const v128_t vi3 = wasm_v128_load(i3); i3 += 4; const v128_t vi4 = wasm_v128_load(i4); i4 += 4; const v128_t vi5 = wasm_v128_load(i5); i5 += 4; const v128_t vi6 = wasm_v128_load(i6); i6 += 4; const v128_t vi7 = wasm_v128_load(i7); i7 += 4; const v128_t vi8 = wasm_v128_load(i8); i8 += 4; const v128_t vsum018 = wasm_f32x4_add(wasm_f32x4_add(vi0, vi1), vi8); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); wasm_v128_store(output, vout); output += 4; c -= 4; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vi8 = wasm_v128_load(i8); const v128_t vsum01 = 
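// Channel remainder (c < 4): full 16-byte loads are kept -- XNN_OOB_READS in the signature
// sanctions reading past the row -- and only the low 2 and/or 1 lanes are stored at the end.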
wasm_f32x4_add(vi0, vi1); const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); const v128_t vsum67 = wasm_f32x4_add(vi6, vi7); const v128_t vsum018 = wasm_f32x4_add(vsum01, vi8); const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); const v128_t vsum01678 = wasm_f32x4_add(vsum018, vsum67); const v128_t vsum = wasm_f32x4_add(vsum2345, vsum01678); v128_t vout = wasm_f32x4_mul(vsum, vmultiplier); vout = wasm_f32x4_pmax(vmin, vout); vout = wasm_f32x4_pmin(vmax, vout); if (c & 2) { wasm_v128_store64_lane(output, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); output += 2; } if (c & 1) { wasm_v128_store32_lane(output, vout, 0); output += 1; } } output = (float*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_f32_prelu_ukernel__wasmsimd_iminmax_2x8( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const v128_t vzero = wasm_i32x4_const_splat(0); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const v128_t vw0123 = wasm_v128_load(w); const v128_t vw4567 = wasm_v128_load(w + 4); w += 8; v128_t vi0x0123 = wasm_v128_load(i0); v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; v128_t vi1x0123 = wasm_v128_load(i1); v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); v128_t vacc0x4567 = wasm_i32x4_max(vi0x4567, vzero); vi0x4567 = wasm_i32x4_min(vi0x4567, vzero); v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); v128_t vacc1x4567 = wasm_i32x4_max(vi1x4567, vzero); vi1x4567 = wasm_i32x4_min(vi1x4567, vzero); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi0x4567, vw4567), vacc0x4567); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(vi1x4567, vw4567), vacc1x4567); wasm_v128_store(o0, vacc0x0123); wasm_v128_store(o0 + 4, vacc0x4567); o0 += 8; wasm_v128_store(o1, vacc1x0123); wasm_v128_store(o1 + 4, vacc1x4567); o1 += 8; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const v128_t vw0123 = wasm_v128_load(w); w += 4; v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); wasm_v128_store(o0, vacc0x0123); o0 += 4; wasm_v128_store(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const v128_t vw0123 = wasm_v128_load(w); w = (const float*) ((uintptr_t) w + c); v128_t vi0x0123 = wasm_v128_load(i0); i0 = (const float*) ((uintptr_t) i0 + c); v128_t vi1x0123 = 
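// iminmax PReLU: integer min/max against zero split x into its negative and positive halves
// (IEEE-754 floats order like sign-magnitude integers); only the negative half is scaled by the slope.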
wasm_v128_load(i1); i1 = (const float*) ((uintptr_t) i1 + c); v128_t vacc0x0123 = wasm_i32x4_max(vi0x0123, vzero); vi0x0123 = wasm_i32x4_min(vi0x0123, vzero); v128_t vacc1x0123 = wasm_i32x4_max(vi1x0123, vzero); vi1x0123 = wasm_i32x4_min(vi1x0123, vzero); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi0x0123, vw0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vi1x0123, vw0123), vacc1x0123); if (c & (2 * sizeof(float))) { wasm_v128_store64_lane(o0, vacc0x0123, 0); wasm_v128_store64_lane(o1, vacc1x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); o0 += 2; o1 += 2; } if (c & (1 * sizeof(float))) { wasm_v128_store32_lane(o0, vacc0x0123, 0); wasm_v128_store32_lane(o1, vacc1x0123, 0); o0 += 1; o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_prelu_ukernel__wasmsimd_laneselect_2x8( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) { const v128_t vw0123 = wasm_v128_load(w); const v128_t vw4567 = wasm_v128_load(w + 4); w += 8; const v128_t vi0x0123 = wasm_v128_load(i0); const v128_t vi0x4567 = wasm_v128_load(i0 + 4); i0 += 8; const v128_t vi1x0123 = wasm_v128_load(i1); const v128_t vi1x4567 = wasm_v128_load(i1 + 4); i1 += 8; v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); v128_t vacc0x4567 = wasm_f32x4_mul(vi0x4567, vw4567); const v128_t vmask0x4567 = wasm_i32x4_shr(vi0x4567, 31); v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); v128_t vacc1x4567 = wasm_f32x4_mul(vi1x4567, vw4567); const v128_t vmask1x4567 = wasm_i32x4_shr(vi1x4567, 31); vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vi0x4567, vmask0x4567); vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vi1x4567, vmask1x4567); wasm_v128_store(o0, vacc0x0123); wasm_v128_store(o0 + 4, vacc0x4567); o0 += 8; wasm_v128_store(o1, vacc1x0123); wasm_v128_store(o1 + 4, vacc1x4567); o1 += 8; } for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const v128_t vw0123 = wasm_v128_load(w); w += 4; const v128_t vi0x0123 = wasm_v128_load(i0); i0 += 4; const v128_t vi1x0123 = wasm_v128_load(i1); i1 += 4; v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); vacc1x0123 = wasm_v128_bitselect(vacc1x0123, 
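// laneselect PReLU: vmask = x >> 31 smears each lane's sign bit across the lane, so bitselect
// returns w*x for negative lanes and x unchanged otherwise.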
vi1x0123, vmask1x0123); wasm_v128_store(o0, vacc0x0123); o0 += 4; wasm_v128_store(o1, vacc1x0123); o1 += 4; } if XNN_UNLIKELY(c != 0) { const v128_t vw0123 = wasm_v128_load(w); w = (const float*) ((uintptr_t) w + c); const v128_t vi0x0123 = wasm_v128_load(i0); i0 = (const float*) ((uintptr_t) i0 + c); const v128_t vi1x0123 = wasm_v128_load(i1); i1 = (const float*) ((uintptr_t) i1 + c); v128_t vacc0x0123 = wasm_f32x4_mul(vi0x0123, vw0123); const v128_t vmask0x0123 = wasm_i32x4_shr(vi0x0123, 31); v128_t vacc1x0123 = wasm_f32x4_mul(vi1x0123, vw0123); const v128_t vmask1x0123 = wasm_i32x4_shr(vi1x0123, 31); vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vi0x0123, vmask0x0123); vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vi1x0123, vmask1x0123); if (c & (2 * sizeof(float))) { wasm_v128_store64_lane(o0, vacc0x0123, 0); wasm_v128_store64_lane(o1, vacc1x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); o0 += 2; o1 += 2; } if (c & (1 * sizeof(float))) { wasm_v128_store32_lane(o0, vacc0x0123, 0); wasm_v128_store32_lane(o1, vacc1x0123, 0); o0 += 1; o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = 
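// qc8w packing: each 8-column block stores 8 f32 biases, then 8 int8 weights per k step, then
// 8 f32 per-channel scales; weights are widened i8 -> i16 -> i32 and converted to f32 four at a time.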
wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); w = (const float*) w + 8; vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = 
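// _x86_ variant: the clamp below uses pmax/pmin (b < a ? b : a) instead of f32x4_max/min so it
// can lower to single minps/maxps instructions; the _arm_ variant above keeps max/min for NEON.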
wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); w = (const float*) w + 8; vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_relu_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const 
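// relu variant: params carry no min/max; the activation is fused below as an integer max with zero.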
float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 
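// i32x4_max with 0 implements ReLU on f32 lanes: any float with the sign bit set compares
// negative as a signed integer, so negative lanes (including -0.0f) become +0.0f in one integer op.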
= wasm_f32x4_mul(vacc0x4567, vscale4567); w = (const float*) w + 8; const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_ukernel_1x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); w = (const int8_t*) w + 32; k 
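// Unclamped ukernel: identical inner loop to the relu/minmax variants; results are stored
// straight after the per-channel scaling, with no activation applied.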
-= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); w = (const float*) w + 8; if XNN_LIKELY(nc >= 8) { wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c0, vacc0x0123); vacc0x0123 = vacc0x4567; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = 
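// 4x8 tile: each broadcast B vector is reused across four A rows, so one 4-byte weight fetch
// serves four rows of multiply-adds.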
wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = 
wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); w = (const float*) w + 8; vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_pmax(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_pmax(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_pmax(vmin, vacc3x0123); vacc0x4567 = wasm_f32x4_pmax(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_pmax(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_pmax(vmin, vacc2x4567); vacc3x4567 = wasm_f32x4_pmax(vmin, vacc3x4567); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_pmin(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_pmin(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_pmin(vmax, vacc3x0123); vacc0x4567 = wasm_f32x4_pmin(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_pmin(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_pmin(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_pmin(vmax, vacc3x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + 
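// Writeback: rows store c3..c0, each pointer advancing by cn_stride to the next 8-column block,
// while the A pointers rewind by kc bytes to replay the same rows against the next columns.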
cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_relu_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = 
wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = 
wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); w = (const float*) w + 8; const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); 
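// nc remainder: store 4, then 2, then 1 columns, shuffling the surviving lanes down after each
// partial store.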
wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_ukernel_4x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), 
vacc3x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 
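// k remainder: broadcast a single A element per row and issue one multiply-add step per
// leftover float until k is exhausted.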
0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); w = (const float*) w + 8; if XNN_LIKELY(nc >= 8) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, 
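// w is the packed weight stream for each 8-column group: 8 f32 biases,
// then one group of 8 int8 weights per k step, then 8 f32 per-channel scales.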
float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = 
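// Widen the next 4 signed int8 weights to f32: load 4 bytes,
// sign-extend i8 -> i16 -> i32, then convert to float.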
wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = 
wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); vacc4x0123 = wasm_f32x4_mul(vacc4x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); vacc4x4567 = wasm_f32x4_mul(vacc4x4567, vscale4567); w = (const float*) w + 8; vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_max(vmin, vacc1x0123); vacc2x0123 = wasm_f32x4_max(vmin, vacc2x0123); vacc3x0123 = wasm_f32x4_max(vmin, vacc3x0123); vacc4x0123 = wasm_f32x4_max(vmin, vacc4x0123); vacc0x4567 = wasm_f32x4_max(vmin, vacc0x4567); vacc1x4567 = wasm_f32x4_max(vmin, vacc1x4567); vacc2x4567 = wasm_f32x4_max(vmin, vacc2x4567); vacc3x4567 = wasm_f32x4_max(vmin, vacc3x4567); vacc4x4567 = wasm_f32x4_max(vmin, vacc4x4567); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_min(vmax, vacc1x0123); vacc2x0123 = wasm_f32x4_min(vmax, vacc2x0123); vacc3x0123 = wasm_f32x4_min(vmax, vacc3x0123); vacc4x0123 = wasm_f32x4_min(vmax, vacc4x0123); vacc0x4567 = wasm_f32x4_min(vmax, vacc0x4567); vacc1x4567 = wasm_f32x4_min(vmax, vacc1x4567); vacc2x4567 = wasm_f32x4_min(vmax, vacc2x4567); vacc3x4567 = wasm_f32x4_min(vmax, vacc3x4567); vacc4x4567 = wasm_f32x4_min(vmax, vacc4x4567); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); 
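// Finish storing the full 8-wide tile, advance each c pointer by cn_stride,
// and rewind the a pointers by kc for the next 8-column block.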
wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_relu_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = 
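// Main K loop: load 4 activations per row once, then broadcast each lane
// (c0..c3) against the matching group of 8 widened int8 weights.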
wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = 
wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); 
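// Scale the remaining rows, then apply ReLU below via wasm_i32x4_max against zero:
// non-negative floats order the same as their signed-integer bit patterns, and any
// negative float is a negative integer, so the integer max clamps it to +0.0f.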
vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); vacc4x0123 = wasm_f32x4_mul(vacc4x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); vacc4x4567 = wasm_f32x4_mul(vacc4x4567, vscale4567); w = (const float*) w + 8; const v128_t vzero = wasm_i32x4_const_splat(0); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vzero); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vzero); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vzero); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vzero); vacc4x0123 = wasm_i32x4_max(vacc4x0123, vzero); vacc0x4567 = wasm_i32x4_max(vacc0x4567, vzero); vacc1x4567 = wasm_i32x4_max(vacc1x4567, vzero); vacc2x4567 = wasm_i32x4_max(vacc2x4567, vzero); vacc3x4567 = wasm_i32x4_max(vacc3x4567, vzero); vacc4x4567 = wasm_i32x4_max(vacc4x4567, vzero); if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qc8w_gemm_ukernel_5x8__wasmsimd_splat( size_t mr, size_t nc, size_t kc, const float* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) { assert(mr != 0); assert(mr <= 5); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(float) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const float* a0 = a; float* c0 = c; const 
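// Rows past mr alias the previous row's a/c pointers, so the extra lanes of
// work are computed and stored harmlessly over already-valid rows.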
float* a1 = (const float*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const float* a2 = (const float*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const float* a3 = (const float*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr < 4) { a3 = a2; c3 = c2; } const float* a4 = (const float*) ((uintptr_t) a3 + a_stride); float* c4 = (float*) ((uintptr_t) c3 + cm_stride); if XNN_UNPREDICTABLE(mr <= 4) { a4 = a3; c4 = c3; } do { v128_t vacc0x0123 = wasm_v128_load((const float*) w + 0); v128_t vacc0x4567 = wasm_v128_load((const float*) w + 4); v128_t vacc1x0123 = vacc0x0123; v128_t vacc1x4567 = vacc0x4567; v128_t vacc2x0123 = vacc0x0123; v128_t vacc2x4567 = vacc0x4567; v128_t vacc3x0123 = vacc0x0123; v128_t vacc3x4567 = vacc0x4567; v128_t vacc4x0123 = vacc0x0123; v128_t vacc4x4567 = vacc0x4567; w = (const float*) w + 8; size_t k = kc; while (k >= 4 * sizeof(float)) { const v128_t va0 = wasm_v128_load(a0); a0 += 4; const v128_t va1 = wasm_v128_load(a1); a1 += 4; const v128_t va2 = wasm_v128_load(a2); a2 += 4; const v128_t va3 = wasm_v128_load(a3); a3 += 4; const v128_t va4 = wasm_v128_load(a4); a4 += 4; const v128_t va0c0 = wasm_v32x4_shuffle(va0, va0, 0, 0, 0, 0); const v128_t va1c0 = wasm_v32x4_shuffle(va1, va1, 0, 0, 0, 0); const v128_t va2c0 = wasm_v32x4_shuffle(va2, va2, 0, 0, 0, 0); const v128_t va3c0 = wasm_v32x4_shuffle(va3, va3, 0, 0, 0, 0); const v128_t va4c0 = wasm_v32x4_shuffle(va4, va4, 0, 0, 0, 0); const v128_t vb0123c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 0)))); const v128_t vb4567c0 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb0123c0), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb0123c0), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb0123c0), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb0123c0), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb0123c0), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c0, vb4567c0), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c0, vb4567c0), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c0, vb4567c0), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c0, vb4567c0), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c0, vb4567c0), vacc4x4567); const v128_t va0c1 = wasm_v32x4_shuffle(va0, va0, 1, 1, 1, 1); const v128_t va1c1 = wasm_v32x4_shuffle(va1, va1, 1, 1, 1, 1); const v128_t va2c1 = wasm_v32x4_shuffle(va2, va2, 1, 1, 1, 1); const v128_t va3c1 = wasm_v32x4_shuffle(va3, va3, 1, 1, 1, 1); const v128_t va4c1 = wasm_v32x4_shuffle(va4, va4, 1, 1, 1, 1); const v128_t vb0123c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 8)))); const v128_t vb4567c1 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 12)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb0123c1), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb0123c1), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb0123c1), 
vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb0123c1), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb0123c1), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c1, vb4567c1), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c1, vb4567c1), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c1, vb4567c1), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c1, vb4567c1), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c1, vb4567c1), vacc4x4567); const v128_t va0c2 = wasm_v32x4_shuffle(va0, va0, 2, 2, 2, 2); const v128_t va1c2 = wasm_v32x4_shuffle(va1, va1, 2, 2, 2, 2); const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); const v128_t va3c2 = wasm_v32x4_shuffle(va3, va3, 2, 2, 2, 2); const v128_t va4c2 = wasm_v32x4_shuffle(va4, va4, 2, 2, 2, 2); const v128_t vb0123c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 16)))); const v128_t vb4567c2 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 20)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb0123c2), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb0123c2), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb0123c2), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb0123c2), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb0123c2), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c2, vb4567c2), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c2, vb4567c2), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c2, vb4567c2), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c2, vb4567c2), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c2, vb4567c2), vacc4x4567); const v128_t va0c3 = wasm_v32x4_shuffle(va0, va0, 3, 3, 3, 3); const v128_t va1c3 = wasm_v32x4_shuffle(va1, va1, 3, 3, 3, 3); const v128_t va2c3 = wasm_v32x4_shuffle(va2, va2, 3, 3, 3, 3); const v128_t va3c3 = wasm_v32x4_shuffle(va3, va3, 3, 3, 3, 3); const v128_t va4c3 = wasm_v32x4_shuffle(va4, va4, 3, 3, 3, 3); const v128_t vb0123c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 24)))); const v128_t vb4567c3 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 28)))); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb0123c3), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb0123c3), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb0123c3), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb0123c3), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb0123c3), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0c3, vb4567c3), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1c3, vb4567c3), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2c3, vb4567c3), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3c3, vb4567c3), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4c3, vb4567c3), vacc4x4567); w = (const int8_t*) w + 32; k -= 4 * sizeof(float); } if XNN_UNLIKELY(k != 0) { do { const v128_t va0 = wasm_v128_load32_splat(a0); a0 += 1; const v128_t va1 = wasm_v128_load32_splat(a1); a1 += 1; const v128_t va2 = wasm_v128_load32_splat(a2); a2 += 1; const v128_t va3 = 
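// Same K-remainder scheme as the minmax and relu variants; this default kernel
// applies the per-channel scales but no output clamping.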
wasm_v128_load32_splat(a3); a3 += 1; const v128_t va4 = wasm_v128_load32_splat(a4); a4 += 1; const v128_t vb0123 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w)))); const v128_t vb4567 = wasm_f32x4_convert_i32x4(wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(wasm_v128_load32_splat((const int8_t*) w + 4)))); w = (const int8_t*) w + 8; vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb0123), vacc0x0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb0123), vacc1x0123); vacc2x0123 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb0123), vacc2x0123); vacc3x0123 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb0123), vacc3x0123); vacc4x0123 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb0123), vacc4x0123); vacc0x4567 = wasm_f32x4_add(wasm_f32x4_mul(va0, vb4567), vacc0x4567); vacc1x4567 = wasm_f32x4_add(wasm_f32x4_mul(va1, vb4567), vacc1x4567); vacc2x4567 = wasm_f32x4_add(wasm_f32x4_mul(va2, vb4567), vacc2x4567); vacc3x4567 = wasm_f32x4_add(wasm_f32x4_mul(va3, vb4567), vacc3x4567); vacc4x4567 = wasm_f32x4_add(wasm_f32x4_mul(va4, vb4567), vacc4x4567); k -= sizeof(float); } while (k != 0); } const v128_t vscale0123 = wasm_v128_load((const float*) w + 0); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); vacc4x0123 = wasm_f32x4_mul(vacc4x0123, vscale0123); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); vacc4x4567 = wasm_f32x4_mul(vacc4x4567, vscale4567); w = (const float*) w + 8; if XNN_LIKELY(nc >= 8) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c4 + 4, vacc4x4567); c4 = (float*) ((uintptr_t) c4 + cn_stride); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c3 + 4, vacc3x4567); c3 = (float*) ((uintptr_t) c3 + cn_stride); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c2 + 4, vacc2x4567); c2 = (float*) ((uintptr_t) c2 + cn_stride); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c1 + 4, vacc1x4567); c1 = (float*) ((uintptr_t) c1 + cn_stride); wasm_v128_store(c0, vacc0x0123); wasm_v128_store(c0 + 4, vacc0x4567); c0 = (float*) ((uintptr_t) c0 + cn_stride); a4 = (const float*) ((uintptr_t) a4 - kc); a3 = (const float*) ((uintptr_t) a3 - kc); a2 = (const float*) ((uintptr_t) a2 - kc); a1 = (const float*) ((uintptr_t) a1 - kc); a0 = (const float*) ((uintptr_t) a0 - kc); nc -= 8; } else { if (nc & 4) { wasm_v128_store(c4, vacc4x0123); wasm_v128_store(c3, vacc3x0123); wasm_v128_store(c2, vacc2x0123); wasm_v128_store(c1, vacc1x0123); wasm_v128_store(c0, vacc0x0123); vacc4x0123 = vacc4x4567; vacc3x0123 = vacc3x4567; vacc2x0123 = vacc2x4567; vacc1x0123 = vacc1x4567; vacc0x0123 = vacc0x4567; c4 += 4; c3 += 4; c2 += 4; c1 += 4; c0 += 4; } if (nc & 2) { wasm_v128_store64_lane(c4, vacc4x0123, 0); wasm_v128_store64_lane(c3, vacc3x0123, 0); wasm_v128_store64_lane(c2, vacc2x0123, 0); wasm_v128_store64_lane(c1, vacc1x0123, 0); wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc4x0123 = wasm_v64x2_shuffle(vacc4x0123, vacc4x0123, 1, 1); vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, 
vacc0x0123, 1, 1); c4 += 2; c3 += 2; c2 += 2; c1 += 2; c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c4, vacc4x0123, 0); wasm_v128_store32_lane(c3, vacc3x0123, 0); wasm_v128_store32_lane(c2, vacc2x0123, 0); wasm_v128_store32_lane(c1, vacc1x0123, 0); wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32( size_t batch, const float* input, int8_t* output, const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vscale = wasm_v128_load64_splat(params->wasmsimd_magic.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_magic.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->wasmsimd_magic.magic_min); const v128_t vmagic_bias_less_zero_point = wasm_v128_load64_splat(params->wasmsimd_magic.magic_bias_less_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd_magic.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); v128_t vx89AB = wasm_v128_load(input + 8); v128_t vxCDEF = wasm_v128_load(input + 12); v128_t vxGHIJ = wasm_v128_load(input + 16); v128_t vxKLMN = wasm_v128_load(input + 20); v128_t vxOPQR = wasm_v128_load(input + 24); v128_t vxSTUV = wasm_v128_load(input + 28); input += 32; vx0123 = wasm_f32x4_mul(vx0123, vscale); vx4567 = wasm_f32x4_mul(vx4567, vscale); vx89AB = wasm_f32x4_mul(vx89AB, vscale); vxCDEF = wasm_f32x4_mul(vxCDEF, vscale); vxGHIJ = wasm_f32x4_mul(vxGHIJ, vscale); vxKLMN = wasm_f32x4_mul(vxKLMN, vscale); vxOPQR = wasm_f32x4_mul(vxOPQR, vscale); vxSTUV = wasm_f32x4_mul(vxSTUV, vscale); vx0123 = wasm_f32x4_add(vx0123, vmagic_bias); vx4567 = wasm_f32x4_add(vx4567, vmagic_bias); vx89AB = wasm_f32x4_add(vx89AB, vmagic_bias); vxCDEF = wasm_f32x4_add(vxCDEF, vmagic_bias); vxGHIJ = wasm_f32x4_add(vxGHIJ, vmagic_bias); vxKLMN = wasm_f32x4_add(vxKLMN, vmagic_bias); vxOPQR = wasm_f32x4_add(vxOPQR, vmagic_bias); vxSTUV = wasm_f32x4_add(vxSTUV, vmagic_bias); v128_t vacc0123 = wasm_i32x4_max(vx0123, vmagic_min); v128_t vacc4567 = wasm_i32x4_max(vx4567, vmagic_min); v128_t vacc89AB = wasm_i32x4_max(vx89AB, vmagic_min); v128_t vaccCDEF = wasm_i32x4_max(vxCDEF, vmagic_min); v128_t vaccGHIJ = wasm_i32x4_max(vxGHIJ, vmagic_min); v128_t vaccKLMN = wasm_i32x4_max(vxKLMN, vmagic_min); v128_t vaccOPQR = wasm_i32x4_max(vxOPQR, vmagic_min); v128_t vaccSTUV = wasm_i32x4_max(vxSTUV, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_zero_point); vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_zero_point); vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_zero_point); vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_zero_point); vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_zero_point); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); const v128_t vacc89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); const v128_t vaccGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); const v128_t vaccOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); v128_t vy0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = 
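// Narrow i32 -> i16 -> i8: the magic-bias addition above fixed the rounded integer
// in the low mantissa bits, and subtracting magic_bias_less_zero_point re-centered
// the values on the quantized zero point; only the output_max clamp remains.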
wasm_i8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); vy0123456789ABCDEF = wasm_i8x16_min(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = wasm_i8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); input += 8; vx_lo = wasm_f32x4_mul(vx_lo, vscale); vx_hi = wasm_f32x4_mul(vx_hi, vscale); vx_lo = wasm_f32x4_add(vx_lo, vmagic_bias); vx_hi = wasm_f32x4_add(vx_hi, vmagic_bias); v128_t vacc_lo = wasm_i32x4_max(vx_lo, vmagic_min); v128_t vacc_hi = wasm_i32x4_max(vx_hi, vmagic_min); vacc_lo = wasm_i32x4_sub(vacc_lo, vmagic_bias_less_zero_point); vacc_hi = wasm_i32x4_sub(vacc_hi, vmagic_bias_less_zero_point); const v128_t vacc = wasm_i16x8_narrow_i32x4(vacc_lo, vacc_hi); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); vy = wasm_i8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); v128_t vx_lo = wasm_v128_load(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); v128_t vx_hi = wasm_v128_load(x_hi); vx_lo = wasm_f32x4_mul(vx_lo, vscale); vx_hi = wasm_f32x4_mul(vx_hi, vscale); vx_lo = wasm_f32x4_add(vx_lo, vmagic_bias); vx_hi = wasm_f32x4_add(vx_hi, vmagic_bias); v128_t vacc_lo = wasm_i32x4_max(vx_lo, vmagic_min); v128_t vacc_hi = wasm_i32x4_max(vx_hi, vmagic_min); vacc_lo = wasm_i32x4_sub(vacc_lo, vmagic_bias_less_zero_point); vacc_hi = wasm_i32x4_sub(vacc_hi, vmagic_bias_less_zero_point); const v128_t vacc = wasm_i16x8_narrow_i32x4(vacc_lo, vacc_hi); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); vy = wasm_i8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(float))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32( size_t batch, const float* input, uint8_t* output, const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vscale = wasm_v128_load64_splat(params->wasmsimd_magic.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_magic.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->wasmsimd_magic.magic_min); const v128_t vmagic_bias_less_zero_point = wasm_v128_load64_splat(params->wasmsimd_magic.magic_bias_less_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd_magic.output_max); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); v128_t vx89AB = wasm_v128_load(input + 8); v128_t vxCDEF = wasm_v128_load(input + 12); v128_t vxGHIJ = wasm_v128_load(input + 16); v128_t vxKLMN = wasm_v128_load(input + 20); v128_t vxOPQR = wasm_v128_load(input + 24); v128_t vxSTUV = wasm_v128_load(input + 28); input += 32; vx0123 = wasm_f32x4_mul(vx0123, vscale); vx4567 = wasm_f32x4_mul(vx4567, vscale); vx89AB = wasm_f32x4_mul(vx89AB, vscale); vxCDEF = wasm_f32x4_mul(vxCDEF, vscale); vxGHIJ = 
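// This unsigned kernel repeats the qs8 scheme above; only the u8 narrowing
// (wasm_u8x16_narrow_i16x8) and the unsigned output_max clamp differ.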
wasm_f32x4_mul(vxGHIJ, vscale); vxKLMN = wasm_f32x4_mul(vxKLMN, vscale); vxOPQR = wasm_f32x4_mul(vxOPQR, vscale); vxSTUV = wasm_f32x4_mul(vxSTUV, vscale); vx0123 = wasm_f32x4_add(vx0123, vmagic_bias); vx4567 = wasm_f32x4_add(vx4567, vmagic_bias); vx89AB = wasm_f32x4_add(vx89AB, vmagic_bias); vxCDEF = wasm_f32x4_add(vxCDEF, vmagic_bias); vxGHIJ = wasm_f32x4_add(vxGHIJ, vmagic_bias); vxKLMN = wasm_f32x4_add(vxKLMN, vmagic_bias); vxOPQR = wasm_f32x4_add(vxOPQR, vmagic_bias); vxSTUV = wasm_f32x4_add(vxSTUV, vmagic_bias); v128_t vacc0123 = wasm_i32x4_max(vx0123, vmagic_min); v128_t vacc4567 = wasm_i32x4_max(vx4567, vmagic_min); v128_t vacc89AB = wasm_i32x4_max(vx89AB, vmagic_min); v128_t vaccCDEF = wasm_i32x4_max(vxCDEF, vmagic_min); v128_t vaccGHIJ = wasm_i32x4_max(vxGHIJ, vmagic_min); v128_t vaccKLMN = wasm_i32x4_max(vxKLMN, vmagic_min); v128_t vaccOPQR = wasm_i32x4_max(vxOPQR, vmagic_min); v128_t vaccSTUV = wasm_i32x4_max(vxSTUV, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_zero_point); vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_zero_point); vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_zero_point); vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_zero_point); vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_zero_point); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); const v128_t vacc89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); const v128_t vaccGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); const v128_t vaccOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); v128_t vy0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vacc01234567, vacc89ABCDEF); v128_t vyGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(vaccGHIJKLMN, vaccOPQRSTUV); vy0123456789ABCDEF = wasm_u8x16_min(vy0123456789ABCDEF, voutput_max); vyGHIJKLMNOPQRSTUV = wasm_u8x16_min(vyGHIJKLMNOPQRSTUV, voutput_max); wasm_v128_store(output, vy0123456789ABCDEF); wasm_v128_store(output + 16, vyGHIJKLMNOPQRSTUV); output += 32; } for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vx_lo = wasm_v128_load(input); v128_t vx_hi = wasm_v128_load(input + 4); input += 8; vx_lo = wasm_f32x4_mul(vx_lo, vscale); vx_hi = wasm_f32x4_mul(vx_hi, vscale); vx_lo = wasm_f32x4_add(vx_lo, vmagic_bias); vx_hi = wasm_f32x4_add(vx_hi, vmagic_bias); v128_t vacc_lo = wasm_i32x4_max(vx_lo, vmagic_min); v128_t vacc_hi = wasm_i32x4_max(vx_hi, vmagic_min); vacc_lo = wasm_i32x4_sub(vacc_lo, vmagic_bias_less_zero_point); vacc_hi = wasm_i32x4_sub(vacc_hi, vmagic_bias_less_zero_point); const v128_t vacc = wasm_i16x8_narrow_i32x4(vacc_lo, vacc_hi); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); vy = wasm_u8x16_min(vy, voutput_max); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); v128_t vx_lo = wasm_v128_load(input); const float* x_hi = (const float*) ((uintptr_t) input + (batch & (4 * sizeof(float)))); v128_t vx_hi = wasm_v128_load(x_hi); vx_lo = wasm_f32x4_mul(vx_lo, vscale); vx_hi = wasm_f32x4_mul(vx_hi, vscale); vx_lo = wasm_f32x4_add(vx_lo, vmagic_bias); vx_hi = wasm_f32x4_add(vx_hi, vmagic_bias); v128_t vacc_lo = wasm_i32x4_max(vx_lo, vmagic_min); v128_t vacc_hi = wasm_i32x4_max(vx_hi, vmagic_min); vacc_lo = wasm_i32x4_sub(vacc_lo, vmagic_bias_less_zero_point); vacc_hi 
= wasm_i32x4_sub(vacc_hi, vmagic_bias_less_zero_point); const v128_t vacc = wasm_i16x8_narrow_i32x4(vacc_lo, vacc_hi); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); vy = wasm_u8x16_min(vy, voutput_max); if (batch & (4 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(float))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2( size_t batch, const float* input, const float* max, float* output, float* sum, const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(max != NULL); assert(output != NULL); assert(sum != NULL); const v128_t vi_max = wasm_v128_load32_splat(max); const v128_t vlog2e = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.log2e); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.magic_bias); const v128_t vminus_ln2_hi = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.minus_ln2_hi); const v128_t vminus_ln2_lo = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.minus_ln2_lo); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c2); const v128_t vc1 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c1); const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.denorm_cutoff); v128_t vacc0 = wasm_f32x4_const_splat(0.0f); v128_t vacc1 = vacc0; for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { // Load 16 (4x4) inputs at a time. 
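// For every vector: subtract the running max, split x = n*ln2 + t with the
// magic-bias trick (2^n is rebuilt by shifting n into the exponent field),
// evaluate the degree-5 polynomial p(t), and reassemble exp(x) = s + s*t*p(t).
// Inputs below denorm_cutoff are flushed to zero via the andnot mask.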
const v128_t vi0123 = wasm_v128_load(input); const v128_t vi4567 = wasm_v128_load(input + 4); const v128_t vi89AB = wasm_v128_load(input + 8); const v128_t viCDEF = wasm_v128_load(input + 12); input += 16; const v128_t vx0123 = wasm_f32x4_sub(vi0123, vi_max); const v128_t vx4567 = wasm_f32x4_sub(vi4567, vi_max); const v128_t vx89AB = wasm_f32x4_sub(vi89AB, vi_max); const v128_t vxCDEF = wasm_f32x4_sub(viCDEF, vi_max); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vlog2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vlog2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vx89AB, vlog2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vxCDEF, vlog2e), vmagic_bias); const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vx0123); v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vx4567); v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vx89AB); v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vxCDEF); vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt0123), vc4); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt4567), vc4); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt89AB), vc4); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc5, vtCDEF), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc1); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc1); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc1); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc1); vt0123 = wasm_f32x4_mul(vt0123, vs0123); vt4567 = wasm_f32x4_mul(vt4567, vs4567); vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); v128_t vf0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); v128_t vf4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); v128_t vf89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); v128_t vfCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_lt(vx0123, vdenorm_cutoff)); vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_lt(vx4567, vdenorm_cutoff)); vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_lt(vx89AB, vdenorm_cutoff)); vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_lt(vxCDEF, vdenorm_cutoff)); wasm_v128_store(output, vf0123); wasm_v128_store(output + 4, vf4567); 
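// Store the remaining two result vectors, then fold all four into the
// running accumulators.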
wasm_v128_store(output + 8, vf89AB); wasm_v128_store(output + 12, vfCDEF); output += 16; vacc0 = wasm_f32x4_add(vacc0, vf0123); vacc1 = wasm_f32x4_add(vacc1, vf4567); vacc0 = wasm_f32x4_add(vacc0, vf89AB); vacc1 = wasm_f32x4_add(vacc1, vfCDEF); }
// Add up all accumulators to vacc0
vacc0 = wasm_f32x4_add(vacc0, vacc1); v128_t vacc = vacc0; for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vi = wasm_v128_load(input); input += 4; const v128_t vx = wasm_f32x4_sub(vi, vi_max); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); vt = wasm_f32x4_mul(vt, vs); v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); wasm_v128_store(output, vf); output += 4; vacc = wasm_f32x4_add(vacc, vf); } vacc = wasm_f32x4_add(vacc, wasm_v64x2_shuffle(vacc, vacc, 1, 1)); float vsum = wasm_f32x4_extract_lane(vacc, 0) + wasm_f32x4_extract_lane(vacc, 1); if (batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 3 * sizeof(float)); const v128_t vi = wasm_v128_load(input); const v128_t vx = wasm_f32x4_sub(vi, vi_max); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vx, vlog2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vx); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc5, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc1); vt = wasm_f32x4_mul(vt, vs); v128_t vf = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); vf = wasm_v128_andnot(vf, wasm_f32x4_lt(vx, vdenorm_cutoff)); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vf, 0); output += 2; vsum += wasm_f32x4_extract_lane(vf, 0) + wasm_f32x4_extract_lane(vf, 1); vf = wasm_v64x2_shuffle(vf, vf, 1, 1); } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vf, 0); vsum += wasm_f32x4_extract_lane(vf, 0); } } *sum = vsum; }
void xnn_f32_rmax_ukernel__wasmsimd_arm( size_t batch, const float* input, float* output) { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); v128_t vmax0 = wasm_v128_load32_splat(input); v128_t vmax1 = vmax0; v128_t vmax2 = vmax0; v128_t vmax3 = vmax0; for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t vx0 = wasm_v128_load(input); const v128_t vx1 = wasm_v128_load(input + 4); const v128_t vx2 = wasm_v128_load(input + 8); const v128_t vx3 = wasm_v128_load(input + 12); input += 16; vmax0 = wasm_f32x4_max(vmax0, vx0); vmax1 = wasm_f32x4_max(vmax1, vx1); vmax2 = wasm_f32x4_max(vmax2, vx2); vmax3 = wasm_f32x4_max(vmax3, vx3); } v128_t vmax0123 = wasm_f32x4_max(wasm_f32x4_max(vmax0, vmax1), wasm_f32x4_max(vmax2, vmax3)); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); vmax0123 = wasm_f32x4_max(vmax0123, vx); input += 4; } vmax0123 = wasm_f32x4_max(vmax0123, wasm_v32x4_shuffle(vmax0123, vmax0123, 2, 3,
void xnn_f32_rmax_ukernel__wasmsimd_x86(
    size_t batch,
    const float* input,
    float* output)
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);
  v128_t vmax0 = wasm_v128_load32_splat(input);
  v128_t vmax1 = vmax0;
  v128_t vmax2 = vmax0;
  v128_t vmax3 = vmax0;
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t vx0 = wasm_v128_load(input);
    const v128_t vx1 = wasm_v128_load(input + 4);
    const v128_t vx2 = wasm_v128_load(input + 8);
    const v128_t vx3 = wasm_v128_load(input + 12);
    input += 16;
    vmax0 = wasm_f32x4_pmax(vx0, vmax0);
    vmax1 = wasm_f32x4_pmax(vx1, vmax1);
    vmax2 = wasm_f32x4_pmax(vx2, vmax2);
    vmax3 = wasm_f32x4_pmax(vx3, vmax3);
  }
  const v128_t vmax01 = wasm_f32x4_pmax(vmax1, vmax0);
  const v128_t vmax23 = wasm_f32x4_pmax(vmax3, vmax2);
  v128_t vmax0123 = wasm_f32x4_pmax(vmax23, vmax01);
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vx = wasm_v128_load(input);
    vmax0123 = wasm_f32x4_pmax(vx, vmax0123);
    input += 4;
  }
  const v128_t vmax2301 = wasm_v32x4_shuffle(vmax0123, vmax0123, 2, 3, 0, 1);
  vmax0123 = wasm_f32x4_pmax(vmax2301, vmax0123);
  float vmax = math_max_f32(wasm_f32x4_extract_lane(vmax0123, 0), wasm_f32x4_extract_lane(vmax0123, 1));
  if XNN_UNLIKELY(batch != 0) {
    do {
      const float vx = *input++;
      vmax = math_max_f32(vx, vmax);
      batch -= sizeof(float);
    } while (batch != 0);
  }
  *output = vmax;
}
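// The reduction kernels share a two-step horizontal-combine idiom: swap the
// 64-bit halves of the vector and combine, then combine lanes 0 and 1. A
// minimal standalone sketch of that idiom (our naming, illustrative only,
// not part of the generated kernels):
//
//   static inline float f32x4_reduce_max_sketch(v128_t v) {
//     // [a, b, c, d] vs. [c, d, a, b] -> [max(a,c), max(b,d), ...]
//     v = wasm_f32x4_max(v, wasm_v32x4_shuffle(v, v, 2, 3, 0, 1));
//     // max(max(a,c), max(b,d))
//     return __builtin_wasm_max_f32(wasm_f32x4_extract_lane(v, 0),
//                                   wasm_f32x4_extract_lane(v, 1));
//   }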
void xnn_f32_rminmax_ukernel__wasmsimd_minmax_x16_acc4(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_default_params* params)
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);
  v128_t vmin0 = wasm_v128_load32_splat(input);
  v128_t vmax0 = vmin0;
  v128_t vmin1 = vmin0;
  v128_t vmax1 = vmax0;
  v128_t vmin2 = vmin0;
  v128_t vmax2 = vmax0;
  v128_t vmin3 = vmin0;
  v128_t vmax3 = vmax0;
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t vt0 = wasm_v128_load(input);
    const v128_t vt1 = wasm_v128_load(input + 4);
    const v128_t vt2 = wasm_v128_load(input + 8);
    const v128_t vt3 = wasm_v128_load(input + 12);
    input += 16;
    vmin0 = wasm_f32x4_min(vmin0, vt0);
    vmax0 = wasm_f32x4_max(vmax0, vt0);
    vmin1 = wasm_f32x4_min(vmin1, vt1);
    vmax1 = wasm_f32x4_max(vmax1, vt1);
    vmin2 = wasm_f32x4_min(vmin2, vt2);
    vmax2 = wasm_f32x4_max(vmax2, vt2);
    vmin3 = wasm_f32x4_min(vmin3, vt3);
    vmax3 = wasm_f32x4_max(vmax3, vt3);
  }
  vmin0 = wasm_f32x4_min(vmin0, vmin1);
  vmax0 = wasm_f32x4_max(vmax0, vmax1);
  vmin2 = wasm_f32x4_min(vmin2, vmin3);
  vmax2 = wasm_f32x4_max(vmax2, vmax3);
  vmin0 = wasm_f32x4_min(vmin0, vmin2);
  vmax0 = wasm_f32x4_max(vmax0, vmax2);
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vt = wasm_v128_load(input);
    input += 4;
    vmin0 = wasm_f32x4_min(vmin0, vt);
    vmax0 = wasm_f32x4_max(vmax0, vt);
  }
  vmin0 = wasm_f32x4_min(vmin0, wasm_v64x2_shuffle(vmin0, vmin0, 1, 1));
  vmax0 = wasm_f32x4_max(vmax0, wasm_v64x2_shuffle(vmax0, vmax0, 1, 1));
  if XNN_UNLIKELY(batch & (2 * sizeof(float))) {
    const v128_t vt = wasm_v128_load64_zero(input);
    input += 2;
    vmin0 = wasm_f32x4_min(vmin0, vt);
    vmax0 = wasm_f32x4_max(vmax0, vt);
  }
  vmin0 = wasm_f32x4_min(vmin0, wasm_v32x4_shuffle(vmin0, vmin0, 1, 1, 1, 1));
  vmax0 = wasm_f32x4_max(vmax0, wasm_v32x4_shuffle(vmax0, vmax0, 1, 1, 1, 1));
  if XNN_UNLIKELY(batch & (1 * sizeof(float))) {
    const v128_t vt = wasm_v128_load32_zero(input);
    vmin0 = wasm_f32x4_min(vmin0, vt);
    vmax0 = wasm_f32x4_max(vmax0, vt);
  }
  wasm_v128_store32_lane(output, vmin0, 0);
  wasm_v128_store32_lane(output + 1, vmax0, 0);
}

void xnn_f32_rsum_ukernel__wasmsimd_x16_acc4(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);
  v128_t vacc0 = wasm_f32x4_const_splat(0.0f);
  v128_t vacc1 = wasm_f32x4_const_splat(0.0f);
  v128_t vacc2 = wasm_f32x4_const_splat(0.0f);
  v128_t vacc3 = wasm_f32x4_const_splat(0.0f);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t vt0 = wasm_v128_load(input);
    const v128_t vt1 = wasm_v128_load(input + 4);
    const v128_t vt2 = wasm_v128_load(input + 8);
    const v128_t vt3 = wasm_v128_load(input + 12);
    input += 16;
    vacc0 = wasm_f32x4_add(vacc0, vt0);
    vacc1 = wasm_f32x4_add(vacc1, vt1);
    vacc2 = wasm_f32x4_add(vacc2, vt2);
    vacc3 = wasm_f32x4_add(vacc3, vt3);
  }
  vacc0 = wasm_f32x4_add(vacc0, vacc1);
  vacc2 = wasm_f32x4_add(vacc2, vacc3);
  vacc0 = wasm_f32x4_add(vacc0, vacc2);
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vt = wasm_v128_load(input);
    input += 4;
    vacc0 = wasm_f32x4_add(vacc0, vt);
  }
  vacc0 = wasm_f32x4_add(vacc0, wasm_v64x2_shuffle(vacc0, vacc0, 1, 1));
  if XNN_UNLIKELY(batch & (2 * sizeof(float))) {
    const v128_t vt = wasm_v128_load64_zero(input);
    input += 2;
    vacc0 = wasm_f32x4_add(vacc0, vt);
  }
  vacc0 = wasm_f32x4_add(vacc0, wasm_v32x4_shuffle(vacc0, vacc0, 1, 1, 1, 1));
  if XNN_UNLIKELY(batch & (1 * sizeof(float))) {
    const v128_t vt = wasm_v128_load32_zero(input);
    vacc0 = wasm_f32x4_add(vacc0, vt);
  }
  const v128_t vscale = wasm_v128_load32_zero(&params->scalar.scale);
  vacc0 = wasm_f32x4_mul(vacc0, vscale);
  wasm_v128_store32_lane(output, vacc0, 0);
}
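// Sparse matrix x dense matrix (SpMM) kernels. The weights arrive in a
// compressed layout: for each of the nc output channels, *nnzmap++ gives the
// number of non-zero weights, and each widx_dmap entry is a byte delta that
// advances the input pointer to the row consumed by the next non-zero weight
// (hence the uintptr_t arithmetic below). The main loop covers 32 floats of
// the mc dimension per pass; remainder blocks of 16/8/4/2/1 floats follow.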
void xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm(
    size_t mc,
    size_t nc,
    const float* input,
    const float* weights,
    const int32_t* widx_dmap,
    const uint32_t* nidx_nnzmap,
    float* output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
  size_t output_decrement = output_stride * nc - 32 * sizeof(float);
  while XNN_LIKELY(mc >= 32 * sizeof(float)) {
    const float* w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    do {
      uint32_t nnz = *nnzmap++;
      v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
      v128_t vacc4567 = vacc0123;
      v128_t vacc89AB = vacc0123;
      v128_t vaccCDEF = vacc0123;
      v128_t vaccGHIJ = vacc0123;
      v128_t vaccKLMN = vacc0123;
      v128_t vaccOPQR = vacc0123;
      v128_t vaccSTUV = vacc0123;
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const v128_t vi0123 = wasm_v128_load(input);
          const v128_t vi4567 = wasm_v128_load(input + 4);
          const v128_t vi89AB = wasm_v128_load(input + 8);
          const v128_t viCDEF = wasm_v128_load(input + 12);
          const v128_t viGHIJ = wasm_v128_load(input + 16);
          const v128_t viKLMN = wasm_v128_load(input + 20);
          const v128_t viOPQR = wasm_v128_load(input + 24);
          const v128_t viSTUV = wasm_v128_load(input + 28);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          const v128_t vw = wasm_v128_load32_splat(w); w += 1;
          vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
          vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          vaccGHIJ = wasm_f32x4_add(vaccGHIJ, wasm_f32x4_mul(viGHIJ, vw));
          vaccKLMN = wasm_f32x4_add(vaccKLMN, wasm_f32x4_mul(viKLMN, vw));
          vaccOPQR = wasm_f32x4_add(vaccOPQR, wasm_f32x4_mul(viOPQR, vw));
          vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
        } while (--nnz != 0);
      }
      v128_t vout0123 = wasm_f32x4_min(vmax, vacc0123);
      v128_t vout4567 = wasm_f32x4_min(vmax, vacc4567);
      v128_t vout89AB = wasm_f32x4_min(vmax, vacc89AB);
      v128_t voutCDEF = wasm_f32x4_min(vmax, vaccCDEF);
      v128_t voutGHIJ = wasm_f32x4_min(vmax, vaccGHIJ);
      v128_t voutKLMN = wasm_f32x4_min(vmax, vaccKLMN);
      v128_t voutOPQR = wasm_f32x4_min(vmax, vaccOPQR);
      v128_t voutSTUV = wasm_f32x4_min(vmax, vaccSTUV);
      vout0123 = wasm_f32x4_max(vmin, vout0123);
      vout4567 = wasm_f32x4_max(vmin, vout4567);
      vout89AB = wasm_f32x4_max(vmin, vout89AB);
      voutCDEF = wasm_f32x4_max(vmin, voutCDEF);
      voutGHIJ = wasm_f32x4_max(vmin, voutGHIJ);
      voutKLMN = wasm_f32x4_max(vmin, voutKLMN);
      voutOPQR = wasm_f32x4_max(vmin, voutOPQR);
      voutSTUV = wasm_f32x4_max(vmin, voutSTUV);
      wasm_v128_store(output, vout0123);
      wasm_v128_store(output + 4, vout4567);
      wasm_v128_store(output + 8, vout89AB);
      wasm_v128_store(output + 12, voutCDEF);
      wasm_v128_store(output + 16, voutGHIJ);
      wasm_v128_store(output + 20, voutKLMN);
      wasm_v128_store(output + 24, voutOPQR);
      wasm_v128_store(output + 28, voutSTUV);
      output = (float*restrict) ((uintptr_t) output + output_stride);
    } while (--n != 0);
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 32;
    mc -= 32 * sizeof(float);
  }
  if XNN_UNLIKELY(mc != 0) {
    output_decrement += 16 * sizeof(float);
    if (mc & (16 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        v128_t vacc89AB = vacc0123;
        v128_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            const v128_t vi89AB = wasm_v128_load(input + 8);
            const v128_t viCDEF = wasm_v128_load(input + 12);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
            vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
            vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_min(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_min(vmax, vacc4567);
        v128_t vout89AB = wasm_f32x4_min(vmax, vacc89AB);
        v128_t voutCDEF = wasm_f32x4_min(vmax, vaccCDEF);
        vout0123 = wasm_f32x4_max(vmin, vout0123);
        vout4567 = wasm_f32x4_max(vmin, vout4567);
        vout89AB = wasm_f32x4_max(vmin, vout89AB);
        voutCDEF = wasm_f32x4_max(vmin, voutCDEF);
        wasm_v128_store(output, vout0123);
        wasm_v128_store(output + 4, vout4567);
        wasm_v128_store(output + 8, vout89AB);
        wasm_v128_store(output + 12, voutCDEF);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 16;
    }
    output_decrement += 8 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_min(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_min(vmax, vacc4567);
        vout0123 = wasm_f32x4_max(vmin, vout0123);
        vout4567 = wasm_f32x4_max(vmin, vout4567);
        wasm_v128_store(output, vout0123);
        wasm_v128_store(output + 4, vout4567);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_min(vmax, vacc0123);
        vout0123 = wasm_f32x4_max(vmin, vout0123);
        wasm_v128_store(output, vout0123);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
    if (mc & (2 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc01 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi01 = wasm_v128_load64_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
          } while (--nnz != 0);
        }
        v128_t vout01 = wasm_f32x4_min(vmax, vacc01);
        vout01 = wasm_f32x4_max(vmin, vout01);
        wasm_v128_store64_lane(output, vout01, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0 = wasm_v128_load32_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
          } while (--nnz != 0);
        }
        v128_t vout0 = wasm_f32x4_min(vmax, vacc0);
        vout0 = wasm_f32x4_max(vmin, vout0);
        wasm_v128_store32_lane(output, vout0, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}
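// Same sparse kernel with x86-flavored clamping: pmin/pmax instead of the
// NaN-propagating min/max used above.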
void xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86(
    size_t mc,
    size_t nc,
    const float* input,
    const float* weights,
    const int32_t* widx_dmap,
    const uint32_t* nidx_nnzmap,
    float* output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);
  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
  size_t output_decrement = output_stride * nc - 32 * sizeof(float);
  while XNN_LIKELY(mc >= 32 * sizeof(float)) {
    const float* w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    do {
      uint32_t nnz = *nnzmap++;
      v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
      v128_t vacc4567 = vacc0123;
      v128_t vacc89AB = vacc0123;
      v128_t vaccCDEF = vacc0123;
      v128_t vaccGHIJ = vacc0123;
      v128_t vaccKLMN = vacc0123;
      v128_t vaccOPQR = vacc0123;
      v128_t vaccSTUV = vacc0123;
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const v128_t vi0123 = wasm_v128_load(input);
          const v128_t vi4567 = wasm_v128_load(input + 4);
          const v128_t vi89AB = wasm_v128_load(input + 8);
          const v128_t viCDEF = wasm_v128_load(input + 12);
          const v128_t viGHIJ = wasm_v128_load(input + 16);
          const v128_t viKLMN = wasm_v128_load(input + 20);
          const v128_t viOPQR = wasm_v128_load(input + 24);
          const v128_t viSTUV = wasm_v128_load(input + 28);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          const v128_t vw = wasm_v128_load32_splat(w); w += 1;
          vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
          vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          vaccGHIJ = wasm_f32x4_add(vaccGHIJ, wasm_f32x4_mul(viGHIJ, vw));
          vaccKLMN = wasm_f32x4_add(vaccKLMN, wasm_f32x4_mul(viKLMN, vw));
          vaccOPQR = wasm_f32x4_add(vaccOPQR, wasm_f32x4_mul(viOPQR, vw));
          vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
        } while (--nnz != 0);
      }
      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
      wasm_v128_store(output, vout0123);
      wasm_v128_store(output + 4, vout4567);
      wasm_v128_store(output + 8, vout89AB);
      wasm_v128_store(output + 12, voutCDEF);
      wasm_v128_store(output + 16, voutGHIJ);
      wasm_v128_store(output + 20, voutKLMN);
      wasm_v128_store(output + 24, voutOPQR);
      wasm_v128_store(output + 28, voutSTUV);
      output = (float*restrict) ((uintptr_t) output + output_stride);
    } while (--n != 0);
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 32;
    mc -= 32 * sizeof(float);
  }
  if XNN_UNLIKELY(mc != 0) {
    output_decrement += 16 * sizeof(float);
    if (mc & (16 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        v128_t vacc89AB = vacc0123;
        v128_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            const v128_t vi89AB = wasm_v128_load(input + 8);
            const v128_t viCDEF = wasm_v128_load(input + 12);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
            vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
            vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
        wasm_v128_store(output, vout0123);
        wasm_v128_store(output + 4, vout4567);
        wasm_v128_store(output + 8, vout89AB);
        wasm_v128_store(output + 12, voutCDEF);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 16;
    }
    output_decrement += 8 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
        wasm_v128_store(output, vout0123);
        wasm_v128_store(output + 4, vout4567);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        wasm_v128_store(output, vout0123);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
    if (mc & (2 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc01 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi01 = wasm_v128_load64_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
          } while (--nnz != 0);
        }
        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
        vout01 = wasm_f32x4_pmax(vmin, vout01);
        wasm_v128_store64_lane(output, vout01, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float* w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0 = wasm_v128_load32_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
          } while (--nnz != 0);
        }
        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
        vout0 = wasm_f32x4_pmax(vmin, vout0);
        wasm_v128_store32_lane(output, vout0, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}
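// The element-wise binary kernels below share one skeleton: a 16-float main
// loop, a 4-float loop, and a remainder that loads a full vector (reading
// past the end of the buffer is permitted, hence the XNN_OOB_READS
// annotation) and stores only the valid lanes with store64_lane/store32_lane.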
void xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_add(va0, vb0);
    v128_t vacc1 = wasm_f32x4_add(va1, vb1);
    v128_t vacc2 = wasm_f32x4_add(va2, vb2);
    v128_t vacc3 = wasm_f32x4_add(va3, vb3);
    vacc0 = wasm_f32x4_max(vacc0, voutput_min);
    vacc1 = wasm_f32x4_max(vacc1, voutput_min);
    vacc2 = wasm_f32x4_max(vacc2, voutput_min);
    vacc3 = wasm_f32x4_max(vacc3, voutput_min);
    vacc0 = wasm_f32x4_min(vacc0, voutput_max);
    vacc1 = wasm_f32x4_min(vacc1, voutput_max);
    vacc2 = wasm_f32x4_min(vacc2, voutput_max);
    vacc3 = wasm_f32x4_min(vacc3, voutput_max);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_add(va, vb);
    vacc = wasm_f32x4_max(vacc, voutput_min);
    vacc = wasm_f32x4_min(vacc, voutput_max);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_add(va, vb);
    vacc = wasm_f32x4_max(vacc, voutput_min);
    vacc = wasm_f32x4_min(vacc, voutput_max);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}

void xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_add(va0, vb0);
    v128_t vacc1 = wasm_f32x4_add(va1, vb1);
    v128_t vacc2 = wasm_f32x4_add(va2, vb2);
    v128_t vacc3 = wasm_f32x4_add(va3, vb3);
    vacc0 = wasm_f32x4_pmax(voutput_min, vacc0);
    vacc1 = wasm_f32x4_pmax(voutput_min, vacc1);
    vacc2 = wasm_f32x4_pmax(voutput_min, vacc2);
    vacc3 = wasm_f32x4_pmax(voutput_min, vacc3);
    vacc0 = wasm_f32x4_pmin(voutput_max, vacc0);
    vacc1 = wasm_f32x4_pmin(voutput_max, vacc1);
    vacc2 = wasm_f32x4_pmin(voutput_max, vacc2);
    vacc3 = wasm_f32x4_pmin(voutput_max, vacc3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_add(va, vb);
    vacc = wasm_f32x4_pmax(voutput_min, vacc);
    vacc = wasm_f32x4_pmin(voutput_max, vacc);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_add(va, vb);
    vacc = wasm_f32x4_pmax(voutput_min, vacc);
    vacc = wasm_f32x4_pmin(voutput_max, vacc);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}
void xnn_f32_vadd_ukernel__wasmsimd_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_add(va0, vb0);
    v128_t vacc1 = wasm_f32x4_add(va1, vb1);
    v128_t vacc2 = wasm_f32x4_add(va2, vb2);
    v128_t vacc3 = wasm_f32x4_add(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_add(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_add(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}
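// The *c ("constant operand") variants that follow broadcast a single scalar
// from input_b once, via wasm_v128_load32_splat, instead of streaming a
// second vector input.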
void xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_add(va0, vb);
    v128_t vy1 = wasm_f32x4_add(va1, vb);
    v128_t vy2 = wasm_f32x4_add(va2, vb);
    v128_t vy3 = wasm_f32x4_add(va3, vb);
    vy0 = wasm_f32x4_max(vy0, voutput_min);
    vy1 = wasm_f32x4_max(vy1, voutput_min);
    vy2 = wasm_f32x4_max(vy2, voutput_min);
    vy3 = wasm_f32x4_max(vy3, voutput_min);
    vy0 = wasm_f32x4_min(vy0, voutput_max);
    vy1 = wasm_f32x4_min(vy1, voutput_max);
    vy2 = wasm_f32x4_min(vy2, voutput_max);
    vy3 = wasm_f32x4_min(vy3, voutput_max);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_add(va, vb);
    vy = wasm_f32x4_max(vy, voutput_min);
    vy = wasm_f32x4_min(vy, voutput_max);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_add(va, vb);
    vy = wasm_f32x4_max(vy, voutput_min);
    vy = wasm_f32x4_min(vy, voutput_max);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_add(va0, vb);
    v128_t vy1 = wasm_f32x4_add(va1, vb);
    v128_t vy2 = wasm_f32x4_add(va2, vb);
    v128_t vy3 = wasm_f32x4_add(va3, vb);
    vy0 = wasm_f32x4_pmax(voutput_min, vy0);
    vy1 = wasm_f32x4_pmax(voutput_min, vy1);
    vy2 = wasm_f32x4_pmax(voutput_min, vy2);
    vy3 = wasm_f32x4_pmax(voutput_min, vy3);
    vy0 = wasm_f32x4_pmin(voutput_max, vy0);
    vy1 = wasm_f32x4_pmin(voutput_max, vy1);
    vy2 = wasm_f32x4_pmin(voutput_max, vy2);
    vy3 = wasm_f32x4_pmin(voutput_max, vy3);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_add(va, vb);
    vy = wasm_f32x4_pmax(voutput_min, vy);
    vy = wasm_f32x4_pmin(voutput_max, vy);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_add(va, vb);
    vy = wasm_f32x4_pmax(voutput_min, vy);
    vy = wasm_f32x4_pmin(voutput_max, vy);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vaddc_ukernel__wasmsimd_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_add(va0, vb);
    v128_t vy1 = wasm_f32x4_add(va1, vb);
    v128_t vy2 = wasm_f32x4_add(va2, vb);
    v128_t vy3 = wasm_f32x4_add(va3, vb);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_add(va, vb);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_add(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}
void xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_div(va0, vb0);
    v128_t vacc1 = wasm_f32x4_div(va1, vb1);
    v128_t vacc2 = wasm_f32x4_div(va2, vb2);
    v128_t vacc3 = wasm_f32x4_div(va3, vb3);
    vacc0 = wasm_f32x4_max(vacc0, voutput_min);
    vacc1 = wasm_f32x4_max(vacc1, voutput_min);
    vacc2 = wasm_f32x4_max(vacc2, voutput_min);
    vacc3 = wasm_f32x4_max(vacc3, voutput_min);
    vacc0 = wasm_f32x4_min(vacc0, voutput_max);
    vacc1 = wasm_f32x4_min(vacc1, voutput_max);
    vacc2 = wasm_f32x4_min(vacc2, voutput_max);
    vacc3 = wasm_f32x4_min(vacc3, voutput_max);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_div(va, vb);
    vacc = wasm_f32x4_max(vacc, voutput_min);
    vacc = wasm_f32x4_min(vacc, voutput_max);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_div(va, vb);
    vacc = wasm_f32x4_max(vacc, voutput_min);
    vacc = wasm_f32x4_min(vacc, voutput_max);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}

void xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_div(va0, vb0);
    v128_t vacc1 = wasm_f32x4_div(va1, vb1);
    v128_t vacc2 = wasm_f32x4_div(va2, vb2);
    v128_t vacc3 = wasm_f32x4_div(va3, vb3);
    vacc0 = wasm_f32x4_pmax(voutput_min, vacc0);
    vacc1 = wasm_f32x4_pmax(voutput_min, vacc1);
    vacc2 = wasm_f32x4_pmax(voutput_min, vacc2);
    vacc3 = wasm_f32x4_pmax(voutput_min, vacc3);
    vacc0 = wasm_f32x4_pmin(voutput_max, vacc0);
    vacc1 = wasm_f32x4_pmin(voutput_max, vacc1);
    vacc2 = wasm_f32x4_pmin(voutput_max, vacc2);
    vacc3 = wasm_f32x4_pmin(voutput_max, vacc3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_div(va, vb);
    vacc = wasm_f32x4_pmax(voutput_min, vacc);
    vacc = wasm_f32x4_pmin(voutput_max, vacc);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_div(va, vb);
    vacc = wasm_f32x4_pmax(voutput_min, vacc);
    vacc = wasm_f32x4_pmin(voutput_max, vacc);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}
void xnn_f32_vdiv_ukernel__wasmsimd_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_div(va0, vb0);
    v128_t vacc1 = wasm_f32x4_div(va1, vb1);
    v128_t vacc2 = wasm_f32x4_div(va2, vb2);
    v128_t vacc3 = wasm_f32x4_div(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_div(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_div(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}
void xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_div(va0, vb);
    v128_t vy1 = wasm_f32x4_div(va1, vb);
    v128_t vy2 = wasm_f32x4_div(va2, vb);
    v128_t vy3 = wasm_f32x4_div(va3, vb);
    vy0 = wasm_f32x4_max(vy0, voutput_min);
    vy1 = wasm_f32x4_max(vy1, voutput_min);
    vy2 = wasm_f32x4_max(vy2, voutput_min);
    vy3 = wasm_f32x4_max(vy3, voutput_min);
    vy0 = wasm_f32x4_min(vy0, voutput_max);
    vy1 = wasm_f32x4_min(vy1, voutput_max);
    vy2 = wasm_f32x4_min(vy2, voutput_max);
    vy3 = wasm_f32x4_min(vy3, voutput_max);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_div(va, vb);
    vy = wasm_f32x4_max(vy, voutput_min);
    vy = wasm_f32x4_min(vy, voutput_max);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_div(va, vb);
    vy = wasm_f32x4_max(vy, voutput_min);
    vy = wasm_f32x4_min(vy, voutput_max);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}
void xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_div(va0, vb);
    v128_t vy1 = wasm_f32x4_div(va1, vb);
    v128_t vy2 = wasm_f32x4_div(va2, vb);
    v128_t vy3 = wasm_f32x4_div(va3, vb);
    vy0 = wasm_f32x4_pmax(voutput_min, vy0);
    vy1 = wasm_f32x4_pmax(voutput_min, vy1);
    vy2 = wasm_f32x4_pmax(voutput_min, vy2);
    vy3 = wasm_f32x4_pmax(voutput_min, vy3);
    vy0 = wasm_f32x4_pmin(voutput_max, vy0);
    vy1 = wasm_f32x4_pmin(voutput_max, vy1);
    vy2 = wasm_f32x4_pmin(voutput_max, vy2);
    vy3 = wasm_f32x4_pmin(voutput_max, vy3);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_div(va, vb);
    vy = wasm_f32x4_pmax(voutput_min, vy);
    vy = wasm_f32x4_pmin(voutput_max, vy);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_div(va, vb);
    vy = wasm_f32x4_pmax(voutput_min, vy);
    vy = wasm_f32x4_pmin(voutput_max, vy);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vdivc_ukernel__wasmsimd_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_div(va0, vb);
    v128_t vy1 = wasm_f32x4_div(va1, vb);
    v128_t vy2 = wasm_f32x4_div(va2, vb);
    v128_t vy3 = wasm_f32x4_div(va3, vb);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_div(va, vb);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_div(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}
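// The vmax/vmin kernels below reuse the same skeleton without clamping
// parameters; only the combining operation (max/min or pmax/pmin) changes.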
void xnn_f32_vmax_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_max(va0, vb0);
    v128_t vacc1 = wasm_f32x4_max(va1, vb1);
    v128_t vacc2 = wasm_f32x4_max(va2, vb2);
    v128_t vacc3 = wasm_f32x4_max(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_max(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_max(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}

void xnn_f32_vmax_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_pmax(va0, vb0);
    v128_t vacc1 = wasm_f32x4_pmax(va1, vb1);
    v128_t vacc2 = wasm_f32x4_pmax(va2, vb2);
    v128_t vacc3 = wasm_f32x4_pmax(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_pmax(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_pmax(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}

void xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_max(va0, vb);
    v128_t vy1 = wasm_f32x4_max(va1, vb);
    v128_t vy2 = wasm_f32x4_max(va2, vb);
    v128_t vy3 = wasm_f32x4_max(va3, vb);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_max(va, vb);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_max(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}
void xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_pmax(vb, va0);
    v128_t vy1 = wasm_f32x4_pmax(vb, va1);
    v128_t vy2 = wasm_f32x4_pmax(vb, va2);
    v128_t vy3 = wasm_f32x4_pmax(vb, va3);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_pmax(vb, va);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_pmax(vb, va);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vmin_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_min(va0, vb0);
    v128_t vacc1 = wasm_f32x4_min(va1, vb1);
    v128_t vacc2 = wasm_f32x4_min(va2, vb2);
    v128_t vacc3 = wasm_f32x4_min(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_min(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_min(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}
void xnn_f32_vmin_ukernel__wasmsimd_x86_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    const v128_t vb0 = wasm_v128_load(input_b);
    const v128_t vb1 = wasm_v128_load(input_b + 4);
    const v128_t vb2 = wasm_v128_load(input_b + 8);
    const v128_t vb3 = wasm_v128_load(input_b + 12);
    input_b += 16;
    v128_t vacc0 = wasm_f32x4_pmin(va0, vb0);
    v128_t vacc1 = wasm_f32x4_pmin(va1, vb1);
    v128_t vacc2 = wasm_f32x4_pmin(va2, vb2);
    v128_t vacc3 = wasm_f32x4_pmin(va3, vb3);
    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 4, vacc1);
    wasm_v128_store(output + 8, vacc2);
    wasm_v128_store(output + 12, vacc3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    const v128_t vb = wasm_v128_load(input_b);
    input_b += 4;
    v128_t vacc = wasm_f32x4_pmin(va, vb);
    wasm_v128_store(output, vacc);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    const v128_t vb = wasm_v128_load(input_b);
    v128_t vacc = wasm_f32x4_pmin(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vacc, 0);
      vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vacc, 0);
    }
  }
}

void xnn_f32_vminc_ukernel__wasmsimd_arm_x16(
    size_t batch,
    const float* input_a,
    const float* input_b,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);
  const v128_t vb = wasm_v128_load32_splat(input_b);
  for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
    const v128_t va0 = wasm_v128_load(input_a);
    const v128_t va1 = wasm_v128_load(input_a + 4);
    const v128_t va2 = wasm_v128_load(input_a + 8);
    const v128_t va3 = wasm_v128_load(input_a + 12);
    input_a += 16;
    v128_t vy0 = wasm_f32x4_min(va0, vb);
    v128_t vy1 = wasm_f32x4_min(va1, vb);
    v128_t vy2 = wasm_f32x4_min(va2, vb);
    v128_t vy3 = wasm_f32x4_min(va3, vb);
    wasm_v128_store(output, vy0);
    wasm_v128_store(output + 4, vy1);
    wasm_v128_store(output + 8, vy2);
    wasm_v128_store(output + 12, vy3);
    output += 16;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t va = wasm_v128_load(input_a);
    input_a += 4;
    v128_t vy = wasm_f32x4_min(va, vb);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t va = wasm_v128_load(input_a);
    v128_t vy = wasm_f32x4_min(va, vb);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}
wasm_f32x4_pmin(vb, va2); v128_t vy3 = wasm_f32x4_pmin(vb, va3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_pmin(vb, va); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_pmin(vb, va); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_mul(va0, vb0); v128_t vacc1 = wasm_f32x4_mul(va1, vb1); v128_t vacc2 = wasm_f32x4_mul(va2, vb2); v128_t vacc3 = wasm_f32x4_mul(va3, vb3); vacc0 = wasm_f32x4_max(vacc0, voutput_min); vacc1 = wasm_f32x4_max(vacc1, voutput_min); vacc2 = wasm_f32x4_max(vacc2, voutput_min); vacc3 = wasm_f32x4_max(vacc3, voutput_min); vacc0 = wasm_f32x4_min(vacc0, voutput_max); vacc1 = wasm_f32x4_min(vacc1, voutput_max); vacc2 = wasm_f32x4_min(vacc2, voutput_max); vacc3 = wasm_f32x4_min(vacc3, voutput_max); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_mul(va, vb); vacc = wasm_f32x4_max(vacc, voutput_min); vacc = wasm_f32x4_min(vacc, voutput_max); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_mul(va, vb); vacc = wasm_f32x4_max(vacc, voutput_min); vacc = wasm_f32x4_min(vacc, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = 
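// The clamp bounds appear to be stored pre-duplicated as a 64-bit float pair
// in the params struct, which is why a single wasm_v128_load64_splat
// populates all four lanes.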
wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_mul(va0, vb0); v128_t vacc1 = wasm_f32x4_mul(va1, vb1); v128_t vacc2 = wasm_f32x4_mul(va2, vb2); v128_t vacc3 = wasm_f32x4_mul(va3, vb3); vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); vacc0 = wasm_f32x4_pmin(voutput_max, vacc0); vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_mul(va, vb); vacc = wasm_f32x4_pmax(voutput_min, vacc); vacc = wasm_f32x4_pmin(voutput_max, vacc); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_mul(va, vb); vacc = wasm_f32x4_pmax(voutput_min, vacc); vacc = wasm_f32x4_pmin(voutput_max, vacc); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vmul_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_mul(va0, vb0); v128_t vacc1 = wasm_f32x4_mul(va1, vb1); v128_t vacc2 = wasm_f32x4_mul(va2, vb2); v128_t vacc3 = wasm_f32x4_mul(va3, vb3); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_mul(va, vb); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const 
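// Remainder handling, shared by every kernel in this file: a full 16-byte
// load is issued even when only 1-3 elements remain, which the XNN_OOB_READS
// annotation on the kernel permits; the 64-bit and 32-bit lane stores that
// follow write back only the live elements.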
v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_mul(va, vb); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_mul(va0, vb); v128_t vy1 = wasm_f32x4_mul(va1, vb); v128_t vy2 = wasm_f32x4_mul(va2, vb); v128_t vy3 = wasm_f32x4_mul(va3, vb); vy0 = wasm_f32x4_max(vy0, voutput_min); vy1 = wasm_f32x4_max(vy1, voutput_min); vy2 = wasm_f32x4_max(vy2, voutput_min); vy3 = wasm_f32x4_max(vy3, voutput_min); vy0 = wasm_f32x4_min(vy0, voutput_max); vy1 = wasm_f32x4_min(vy1, voutput_max); vy2 = wasm_f32x4_min(vy2, voutput_max); vy3 = wasm_f32x4_min(vy3, voutput_max); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_mul(va0, vb); v128_t vy1 = wasm_f32x4_mul(va1, vb); v128_t vy2 = wasm_f32x4_mul(va2, vb); v128_t vy3 = wasm_f32x4_mul(va3, vb); vy0 = wasm_f32x4_pmax(voutput_min, vy0); vy1 = wasm_f32x4_pmax(voutput_min, vy1); vy2 = wasm_f32x4_pmax(voutput_min, vy2); vy3 = wasm_f32x4_pmax(voutput_min, vy3); vy0 = 
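// Note that this pmax/pmin clamp is not NaN-propagating: pmax(min, acc)
// returns the min bound when acc is NaN, so NaN products come out as
// output_min here, whereas the _arm_ variant's f32x4_max/min propagate NaN.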
wasm_f32x4_pmin(voutput_max, vy0); vy1 = wasm_f32x4_pmin(voutput_max, vy1); vy2 = wasm_f32x4_pmin(voutput_max, vy2); vy3 = wasm_f32x4_pmin(voutput_max, vy3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_mul(va, vb); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vmulc_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_mul(va0, vb); v128_t vy1 = wasm_f32x4_mul(va1, vb); v128_t vy2 = wasm_f32x4_mul(va2, vb); v128_t vy3 = wasm_f32x4_mul(va3, vb); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_mul(va, vb); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_mul(va, vb); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_div(vb, va0); v128_t vy1 = wasm_f32x4_div(vb, va1); v128_t vy2 = wasm_f32x4_div(vb, va2); v128_t vy3 = wasm_f32x4_div(vb, va3); vy0 = wasm_f32x4_max(vy0, voutput_min); vy1 = wasm_f32x4_max(vy1, voutput_min); vy2 = wasm_f32x4_max(vy2, voutput_min); vy3 = 
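// vrdivc is the reversed quotient with a broadcast scalar: note the operand
// order wasm_f32x4_div(vb, va), so every output lane is b / a[i].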
wasm_f32x4_max(vy3, voutput_min); vy0 = wasm_f32x4_min(vy0, voutput_max); vy1 = wasm_f32x4_min(vy1, voutput_max); vy2 = wasm_f32x4_min(vy2, voutput_max); vy3 = wasm_f32x4_min(vy3, voutput_max); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_div(vb, va); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_div(vb, va); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_div(vb, va0); v128_t vy1 = wasm_f32x4_div(vb, va1); v128_t vy2 = wasm_f32x4_div(vb, va2); v128_t vy3 = wasm_f32x4_div(vb, va3); vy0 = wasm_f32x4_pmax(voutput_min, vy0); vy1 = wasm_f32x4_pmax(voutput_min, vy1); vy2 = wasm_f32x4_pmax(voutput_min, vy2); vy3 = wasm_f32x4_pmax(voutput_min, vy3); vy0 = wasm_f32x4_pmin(voutput_max, vy0); vy1 = wasm_f32x4_pmin(voutput_max, vy1); vy2 = wasm_f32x4_pmin(voutput_max, vy2); vy3 = wasm_f32x4_pmin(voutput_max, vy3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_div(vb, va); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_div(vb, va); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrdivc_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); 
batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_div(vb, va0); v128_t vy1 = wasm_f32x4_div(vb, va1); v128_t vy2 = wasm_f32x4_div(vb, va2); v128_t vy3 = wasm_f32x4_div(vb, va3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_div(vb, va); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_div(vb, va); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(vb, va0); v128_t vy1 = wasm_f32x4_sub(vb, va1); v128_t vy2 = wasm_f32x4_sub(vb, va2); v128_t vy3 = wasm_f32x4_sub(vb, va3); vy0 = wasm_f32x4_max(vy0, voutput_min); vy1 = wasm_f32x4_max(vy1, voutput_min); vy2 = wasm_f32x4_max(vy2, voutput_min); vy3 = wasm_f32x4_max(vy3, voutput_min); vy0 = wasm_f32x4_min(vy0, voutput_max); vy1 = wasm_f32x4_min(vy1, voutput_max); vy2 = wasm_f32x4_min(vy2, voutput_max); vy3 = wasm_f32x4_min(vy3, voutput_max); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(vb, va); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(vb, va); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = 
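// vrsubc is the reversed difference: each output lane is b - a[i]. Only the
// operand order of wasm_f32x4_sub distinguishes it from vsubc further below.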
wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(vb, va0); v128_t vy1 = wasm_f32x4_sub(vb, va1); v128_t vy2 = wasm_f32x4_sub(vb, va2); v128_t vy3 = wasm_f32x4_sub(vb, va3); vy0 = wasm_f32x4_pmax(voutput_min, vy0); vy1 = wasm_f32x4_pmax(voutput_min, vy1); vy2 = wasm_f32x4_pmax(voutput_min, vy2); vy3 = wasm_f32x4_pmax(voutput_min, vy3); vy0 = wasm_f32x4_pmin(voutput_max, vy0); vy1 = wasm_f32x4_pmin(voutput_max, vy1); vy2 = wasm_f32x4_pmin(voutput_max, vy2); vy3 = wasm_f32x4_pmin(voutput_max, vy3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(vb, va); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(vb, va); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrsubc_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(vb, va0); v128_t vy1 = wasm_f32x4_sub(vb, va1); v128_t vy2 = wasm_f32x4_sub(vb, va2); v128_t vy3 = wasm_f32x4_sub(vb, va3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(vb, va); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(vb, va); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vsqrdiff_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != 
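// vsqrdiff computes the squared difference (a[i] - b[i])^2: one subtract
// followed by a self-multiply per vector.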
NULL); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_sub(va0, vb0); v128_t vacc1 = wasm_f32x4_sub(va1, vb1); v128_t vacc2 = wasm_f32x4_sub(va2, vb2); v128_t vacc3 = wasm_f32x4_sub(va3, vb3); vacc0 = wasm_f32x4_mul(vacc0, vacc0); vacc1 = wasm_f32x4_mul(vacc1, vacc1); vacc2 = wasm_f32x4_mul(vacc2, vacc2); vacc3 = wasm_f32x4_mul(vacc3, vacc3); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_mul(vacc, vacc); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(va0, vb); v128_t vy1 = wasm_f32x4_sub(va1, vb); v128_t vy2 = wasm_f32x4_sub(va2, vb); v128_t vy3 = wasm_f32x4_sub(va3, vb); vy0 = wasm_f32x4_mul(vy0, vy0); vy1 = wasm_f32x4_mul(vy1, vy1); vy2 = wasm_f32x4_mul(vy2, vy2); vy3 = wasm_f32x4_mul(vy3, vy3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_mul(vy, vy); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 
0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_sub(va0, vb0); v128_t vacc1 = wasm_f32x4_sub(va1, vb1); v128_t vacc2 = wasm_f32x4_sub(va2, vb2); v128_t vacc3 = wasm_f32x4_sub(va3, vb3); vacc0 = wasm_f32x4_max(vacc0, voutput_min); vacc1 = wasm_f32x4_max(vacc1, voutput_min); vacc2 = wasm_f32x4_max(vacc2, voutput_min); vacc3 = wasm_f32x4_max(vacc3, voutput_min); vacc0 = wasm_f32x4_min(vacc0, voutput_max); vacc1 = wasm_f32x4_min(vacc1, voutput_max); vacc2 = wasm_f32x4_min(vacc2, voutput_max); vacc3 = wasm_f32x4_min(vacc3, voutput_max); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_max(vacc, voutput_min); vacc = wasm_f32x4_min(vacc, voutput_max); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_max(vacc, voutput_min); vacc = wasm_f32x4_min(vacc, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_sub(va0, vb0); v128_t vacc1 = wasm_f32x4_sub(va1, vb1); v128_t vacc2 = wasm_f32x4_sub(va2, vb2); v128_t vacc3 = wasm_f32x4_sub(va3, vb3); vacc0 = wasm_f32x4_pmax(voutput_min, vacc0); vacc1 = wasm_f32x4_pmax(voutput_min, vacc1); vacc2 = wasm_f32x4_pmax(voutput_min, vacc2); vacc3 = wasm_f32x4_pmax(voutput_min, vacc3); vacc0 = 
wasm_f32x4_pmin(voutput_max, vacc0); vacc1 = wasm_f32x4_pmin(voutput_max, vacc1); vacc2 = wasm_f32x4_pmin(voutput_max, vacc2); vacc3 = wasm_f32x4_pmin(voutput_max, vacc3); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_pmax(voutput_min, vacc); vacc = wasm_f32x4_pmin(voutput_max, vacc); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_sub(va, vb); vacc = wasm_f32x4_pmax(voutput_min, vacc); vacc = wasm_f32x4_pmin(voutput_max, vacc); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vsub_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; const v128_t vb0 = wasm_v128_load(input_b); const v128_t vb1 = wasm_v128_load(input_b + 4); const v128_t vb2 = wasm_v128_load(input_b + 8); const v128_t vb3 = wasm_v128_load(input_b + 12); input_b += 16; v128_t vacc0 = wasm_f32x4_sub(va0, vb0); v128_t vacc1 = wasm_f32x4_sub(va1, vb1); v128_t vacc2 = wasm_f32x4_sub(va2, vb2); v128_t vacc3 = wasm_f32x4_sub(va3, vb3); wasm_v128_store(output, vacc0); wasm_v128_store(output + 4, vacc1); wasm_v128_store(output + 8, vacc2); wasm_v128_store(output + 12, vacc3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; const v128_t vb = wasm_v128_load(input_b); input_b += 4; v128_t vacc = wasm_f32x4_sub(va, vb); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); const v128_t vb = wasm_v128_load(input_b); v128_t vacc = wasm_f32x4_sub(va, vb); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); 
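// All binary kernels here share a three-tier structure: a wide main loop
// (16 floats in four v128 registers), a one-vector loop for 4-float tails,
// and a lane-store epilogue for the final 1-3 elements.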
const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(va0, vb); v128_t vy1 = wasm_f32x4_sub(va1, vb); v128_t vy2 = wasm_f32x4_sub(va2, vb); v128_t vy3 = wasm_f32x4_sub(va3, vb); vy0 = wasm_f32x4_max(vy0, voutput_min); vy1 = wasm_f32x4_max(vy1, voutput_min); vy2 = wasm_f32x4_max(vy2, voutput_min); vy3 = wasm_f32x4_max(vy3, voutput_min); vy0 = wasm_f32x4_min(vy0, voutput_max); vy1 = wasm_f32x4_min(vy1, voutput_max); vy2 = wasm_f32x4_min(vy2, voutput_max); vy3 = wasm_f32x4_min(vy3, voutput_max); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_max(vy, voutput_min); vy = wasm_f32x4_min(vy, voutput_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(va0, vb); v128_t vy1 = wasm_f32x4_sub(va1, vb); v128_t vy2 = wasm_f32x4_sub(va2, vb); v128_t vy3 = wasm_f32x4_sub(va3, vb); vy0 = wasm_f32x4_pmax(voutput_min, vy0); vy1 = wasm_f32x4_pmax(voutput_min, vy1); vy2 = wasm_f32x4_pmax(voutput_min, vy2); vy3 = wasm_f32x4_pmax(voutput_min, vy3); vy0 = wasm_f32x4_pmin(voutput_max, vy0); vy1 = wasm_f32x4_pmin(voutput_max, vy1); vy2 = wasm_f32x4_pmin(voutput_max, vy2); vy3 = wasm_f32x4_pmin(voutput_max, vy3); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(va, vb); vy = wasm_f32x4_pmax(voutput_min, vy); vy = wasm_f32x4_pmin(voutput_max, vy); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, 
vy, 0); } } } void xnn_f32_vsubc_ukernel__wasmsimd_x16( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vb = wasm_v128_load32_splat(input_b); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t va0 = wasm_v128_load(input_a); const v128_t va1 = wasm_v128_load(input_a + 4); const v128_t va2 = wasm_v128_load(input_a + 8); const v128_t va3 = wasm_v128_load(input_a + 12); input_a += 16; v128_t vy0 = wasm_f32x4_sub(va0, vb); v128_t vy1 = wasm_f32x4_sub(va1, vb); v128_t vy2 = wasm_f32x4_sub(va2, vb); v128_t vy3 = wasm_f32x4_sub(va3, vb); wasm_v128_store(output, vy0); wasm_v128_store(output + 4, vy1); wasm_v128_store(output + 8, vy2); wasm_v128_store(output + 12, vy3); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t va = wasm_v128_load(input_a); input_a += 4; v128_t vy = wasm_f32x4_sub(va, vb); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t va = wasm_v128_load(input_a); v128_t vy = wasm_f32x4_sub(va, vb); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vclamp_ukernel__wasmsimd_arm_x8( size_t batch, const float* input, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vy_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vy_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vacc0123 = wasm_v128_load(input); v128_t vacc4567 = wasm_v128_load(input + 4); input += 8; vacc0123 = wasm_f32x4_max(vacc0123, vy_min); vacc4567 = wasm_f32x4_max(vacc4567, vy_min); vacc0123 = wasm_f32x4_min(vacc0123, vy_max); vacc4567 = wasm_f32x4_min(vacc4567, vy_max); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vacc = wasm_v128_load(input); input += 4; vacc = wasm_f32x4_max(vacc, vy_min); vacc = wasm_f32x4_min(vacc, vy_max); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vacc = wasm_v128_load(input); vacc = wasm_f32x4_max(vacc, vy_min); vacc = wasm_f32x4_min(vacc, vy_max); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vclamp_ukernel__wasmsimd_x86_x8( size_t batch, const float* input, float* output, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vy_min = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vy_max = wasm_v128_load64_splat(params->wasmsimd.max); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vacc0123 = wasm_v128_load(input); v128_t vacc4567 = wasm_v128_load(input + 4); input += 8; vacc0123 = 
wasm_f32x4_pmax(vy_min, vacc0123); vacc4567 = wasm_f32x4_pmax(vy_min, vacc4567); vacc0123 = wasm_f32x4_pmin(vy_max, vacc0123); vacc4567 = wasm_f32x4_pmin(vy_max, vacc4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vacc = wasm_v128_load(input); input += 4; vacc = wasm_f32x4_pmax(vy_min, vacc); vacc = wasm_f32x4_pmin(vy_max, vacc); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vacc = wasm_v128_load(input); vacc = wasm_f32x4_pmax(vy_min, vacc); vacc = wasm_f32x4_pmin(vy_max, vacc); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vcmul_ukernel__wasmsimd_x8( size_t batch, const float* input_a, const float* input_b, float* output, const union xnn_f32_default_params* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const float* ar = input_a; const float* ai = (const float*) ((uintptr_t) input_a + batch); const float* br = input_b; const float* bi = (const float*) ((uintptr_t) input_b + batch); float* or = output; float* oi = (float*) ((uintptr_t) output + batch); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t va0r = wasm_v128_load(ar); const v128_t va0i = wasm_v128_load(ai); const v128_t vb0r = wasm_v128_load(br); const v128_t vb0i = wasm_v128_load(bi); const v128_t va1r = wasm_v128_load(ar + 4); const v128_t va1i = wasm_v128_load(ai + 4); const v128_t vb1r = wasm_v128_load(br + 4); const v128_t vb1i = wasm_v128_load(bi + 4); ar += 8; ai += 8; br += 8; bi += 8; v128_t vacc0r = wasm_f32x4_mul(va0r, vb0r); v128_t vacc0i = wasm_f32x4_mul(va0r, vb0i); v128_t vacc1r = wasm_f32x4_mul(va1r, vb1r); v128_t vacc1i = wasm_f32x4_mul(va1r, vb1i); vacc0r = wasm_f32x4_sub(vacc0r, wasm_f32x4_mul(va0i, vb0i)); vacc0i = wasm_f32x4_add(vacc0i, wasm_f32x4_mul(va0i, vb0r)); vacc1r = wasm_f32x4_sub(vacc1r, wasm_f32x4_mul(va1i, vb1i)); vacc1i = wasm_f32x4_add(vacc1i, wasm_f32x4_mul(va1i, vb1r)); wasm_v128_store(or, vacc0r); wasm_v128_store(oi, vacc0i); wasm_v128_store(or + 4, vacc1r); wasm_v128_store(oi + 4, vacc1i); or += 8; oi += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t var = wasm_v128_load(ar); ar += 4; const v128_t vai = wasm_v128_load(ai); ai += 4; const v128_t vbr = wasm_v128_load(br); br += 4; const v128_t vbi = wasm_v128_load(bi); bi += 4; v128_t vaccr = wasm_f32x4_mul(var, vbr); v128_t vacci = wasm_f32x4_mul(var, vbi); vaccr = wasm_f32x4_sub(vaccr, wasm_f32x4_mul(vai, vbi)); vacci = wasm_f32x4_add(vacci, wasm_f32x4_mul(vai, vbr)); wasm_v128_store(or, vaccr); or += 4; wasm_v128_store(oi, vacci); oi += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t var = wasm_v128_load(ar); ar += 4; const v128_t vai = wasm_v128_load(ai); ai += 4; const v128_t vbr = wasm_v128_load(br); br += 4; const v128_t vbi = wasm_v128_load(bi); bi += 4; v128_t vaccr = wasm_f32x4_mul(var, vbr); v128_t vacci = wasm_f32x4_mul(var, vbi); vaccr = wasm_f32x4_sub(vaccr, wasm_f32x4_mul(vai, vbi)); vacci = wasm_f32x4_add(vacci, wasm_f32x4_mul(vai, vbr)); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(or, vaccr, 0); or += 2; wasm_v128_store64_lane(oi, vacci, 0); oi += 2; vaccr = wasm_v64x2_shuffle(vaccr, vaccr, 1, 1); vacci = 
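// vcmul multiplies complex vectors stored in planar (split) form: the first
// `batch` bytes of each operand hold the real parts and the next `batch`
// bytes the imaginary parts. Per element it computes
//   out_r = ar*br - ai*bi
//   out_i = ar*bi + ai*br
// which is exactly the mul/sub/add pairing used above.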
wasm_v64x2_shuffle(vacci, vacci, 1, 1); } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(or, vaccr, 0); wasm_v128_store32_lane(oi, vacci, 0); } } } void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vprescale = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.prescale); const v128_t valpha = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.alpha); const v128_t vbeta = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.beta); const v128_t vsat_cutoff = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.sat_cutoff); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.magic_bias); const v128_t vlog2e = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.log2e); const v128_t vminus_ln2_hi = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.minus_ln2_hi); const v128_t vminus_ln2_lo = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.minus_ln2_lo); const v128_t vc6 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c6); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c2); const v128_t vone = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.one); for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); v128_t vx89AB = wasm_v128_load(input + 8); v128_t vxCDEF = wasm_v128_load(input + 12); v128_t vxGHIJ = wasm_v128_load(input + 16); input += 20; const v128_t vz0123 = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vx0123, vprescale)); const v128_t vz4567 = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vx4567, vprescale)); const v128_t vz89AB = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vx89AB, vprescale)); const v128_t vzCDEF = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vxCDEF, vprescale)); const v128_t vzGHIJ = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vxGHIJ, vprescale)); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vlog2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vlog2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vz89AB, vlog2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vzCDEF, vlog2e), vmagic_bias); v128_t vnGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vzGHIJ, vlog2e), vmagic_bias); v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vz0123); v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vz4567); v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vz89AB); v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vzCDEF); v128_t vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_hi), vzGHIJ); vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, 
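// Second step of the two-constant ("rr2") range reduction: t = z - n*ln2 is
// formed as n*(-ln2_hi) + z, then n*(-ln2_lo) + t, so the rounding error of
// the first product is absorbed by the low-order correction term.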
vminus_ln2_lo), vt0123); vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_lo), vtGHIJ); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt0123), vc5); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt4567), vc5); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt89AB), vc5); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtCDEF), vc5); v128_t vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtGHIJ), vc5); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc4); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc4); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc4); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc4); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc3); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc2); vp0123 = wasm_f32x4_mul(vp0123, vt0123); vp4567 = wasm_f32x4_mul(vp4567, vt4567); vp89AB = wasm_f32x4_mul(vp89AB, vt89AB); vpCDEF = wasm_f32x4_mul(vpCDEF, vtCDEF); vpGHIJ = wasm_f32x4_mul(vpGHIJ, vtGHIJ); vt0123 = wasm_f32x4_mul(vt0123, vs0123); vs0123 = wasm_f32x4_sub(vs0123, vone); vt4567 = wasm_f32x4_mul(vt4567, vs4567); vs4567 = wasm_f32x4_sub(vs4567, vone); vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); vs89AB = wasm_f32x4_sub(vs89AB, vone); vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); vsCDEF = wasm_f32x4_sub(vsCDEF, vone); vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); vsGHIJ = wasm_f32x4_sub(vsGHIJ, vone); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vt0123); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vt4567); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vt89AB); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vtCDEF); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vtGHIJ); const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha); const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha); const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha); const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha); const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha); const v128_t vsignm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); const v128_t vsignm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); const v128_t vsignm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31); vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta); const v128_t vsignmGHIJ = wasm_i32x4_shr(vxGHIJ, 31); vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta); const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vsignm0123); const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vsignm4567); const v128_t vy89AB = wasm_v128_bitselect(ve89AB, 
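// wasm_i32x4_shr(vx, 31) smears each sign bit into a full-lane mask, so the
// bitselect picks the ELU branch alpha*(exp(z) - 1) for negative inputs and
// beta*x for non-negative ones.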
vx89AB, vsignm89AB); const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF); const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vsignmGHIJ); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); wasm_v128_store(output + 16, vyGHIJ); output += 20; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vx = wasm_v128_load(input); input += 4; const v128_t vz = wasm_f32x4_max(vsat_cutoff, wasm_f32x4_mul(vx, vprescale)); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias); v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_mul(vp, vt); vt = wasm_f32x4_mul(vt, vs); vs = wasm_f32x4_sub(vs, vone); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); const v128_t vsignm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); const v128_t vy = wasm_v128_bitselect(ve, vx, vsignm); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vx = wasm_v128_load(input); const v128_t vz = wasm_f32x4_max(wasm_f32x4_mul(vx, vprescale), vsat_cutoff); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias); v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_mul(vp, vt); vt = wasm_f32x4_mul(vt, vs); vs = wasm_f32x4_sub(vs, vone); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); const v128_t vsignm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); v128_t vy = wasm_v128_bitselect(ve, vx, vsignm); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20( size_t batch, const float* input, float* output, const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vprescale = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.prescale); const v128_t valpha = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.alpha); const v128_t vbeta = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.beta); const v128_t vsat_cutoff = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.sat_cutoff); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.magic_bias); const v128_t vlog2e = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.log2e); const v128_t vminus_ln2_hi = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.minus_ln2_hi); const v128_t vminus_ln2_lo = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.minus_ln2_lo); const v128_t vc6 = 
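// Standard magic-bias exponentiation: adding vmagic_bias leaves
// n = round(z*log2e) in the low bits of vn, wasm_i32x4_shl(vn, 23) moves
// those bits into the f32 exponent field to form s = 2^n, and subtracting
// the bias afterwards recovers n as a float.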
wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c6); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.c2); const v128_t vone = wasm_v128_load64_splat(params->wasmsimd_rr2_p6.one); for (; batch >= 20 * sizeof(float); batch -= 20 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); v128_t vx89AB = wasm_v128_load(input + 8); v128_t vxCDEF = wasm_v128_load(input + 12); v128_t vxGHIJ = wasm_v128_load(input + 16); input += 20; const v128_t vz0123 = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vx0123, vprescale)); const v128_t vz4567 = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vx4567, vprescale)); const v128_t vz89AB = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vx89AB, vprescale)); const v128_t vzCDEF = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vxCDEF, vprescale)); const v128_t vzGHIJ = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vxGHIJ, vprescale)); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vlog2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vlog2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vz89AB, vlog2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vzCDEF, vlog2e), vmagic_bias); v128_t vnGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vzGHIJ, vlog2e), vmagic_bias); v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); v128_t vsGHIJ = wasm_i32x4_shl(vnGHIJ, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); vnGHIJ = wasm_f32x4_sub(vnGHIJ, vmagic_bias); v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vz0123); v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vz4567); v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_hi), vz89AB); v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_hi), vzCDEF); v128_t vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_hi), vzGHIJ); vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123); vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567); vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2_lo), vt89AB); vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2_lo), vtCDEF); vtGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vnGHIJ, vminus_ln2_lo), vtGHIJ); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt0123), vc5); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt4567), vc5); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt89AB), vc5); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtCDEF), vc5); v128_t vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtGHIJ), vc5); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc4); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc4); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc4); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc4); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); 
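// Degree-6 polynomial ("p6") approximation of expm1 on the reduced interval,
// evaluated by Horner's rule:
//   p = ((((c6*t + c5)*t + c4)*t + c3)*t + c2)*t
// and later recombined as alpha*((p + 1)*t*s + (s - 1)) ~= alpha*(exp(z) - 1).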
vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc3); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vc2); vp0123 = wasm_f32x4_mul(vp0123, vt0123); vp4567 = wasm_f32x4_mul(vp4567, vt4567); vp89AB = wasm_f32x4_mul(vp89AB, vt89AB); vpCDEF = wasm_f32x4_mul(vpCDEF, vtCDEF); vpGHIJ = wasm_f32x4_mul(vpGHIJ, vtGHIJ); vt0123 = wasm_f32x4_mul(vt0123, vs0123); vs0123 = wasm_f32x4_sub(vs0123, vone); vt4567 = wasm_f32x4_mul(vt4567, vs4567); vs4567 = wasm_f32x4_sub(vs4567, vone); vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); vs89AB = wasm_f32x4_sub(vs89AB, vone); vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); vsCDEF = wasm_f32x4_sub(vsCDEF, vone); vtGHIJ = wasm_f32x4_mul(vtGHIJ, vsGHIJ); vsGHIJ = wasm_f32x4_sub(vsGHIJ, vone); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vt0123); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vt4567); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vt89AB); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vtCDEF); vpGHIJ = wasm_f32x4_add(wasm_f32x4_mul(vpGHIJ, vtGHIJ), vtGHIJ); const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha); const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha); const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha); const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha); const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha); const v128_t vsignm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); const v128_t vsignm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); const v128_t vsignm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); const v128_t vsignmCDEF = wasm_i32x4_shr(vxCDEF, 31); vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta); const v128_t vsignmGHIJ = wasm_i32x4_shr(vxGHIJ, 31); vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta); const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vsignm0123); const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vsignm4567); const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vsignm89AB); const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vsignmCDEF); const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vsignmGHIJ); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); wasm_v128_store(output + 16, vyGHIJ); output += 20; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vx = wasm_v128_load(input); input += 4; const v128_t vz = wasm_f32x4_pmax(vsat_cutoff, wasm_f32x4_mul(vx, vprescale)); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias); v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_mul(vp, vt); vt = wasm_f32x4_mul(vt, vs); vs = wasm_f32x4_sub(vs, vone); vp = 
wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); const v128_t vsignm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); const v128_t vy = wasm_v128_bitselect(ve, vx, vsignm); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vx = wasm_v128_load(input); const v128_t vz = wasm_f32x4_pmax(wasm_f32x4_mul(vx, vprescale), vsat_cutoff); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias); v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_mul(vp, vt); vt = wasm_f32x4_mul(vt, vs); vs = wasm_f32x4_sub(vs, vone); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); const v128_t vsignm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); v128_t vy = wasm_v128_bitselect(ve, vx, vsignm); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vhswish_ukernel__wasmsimd_x16( size_t batch, const float* input, float* output, const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vsixth = wasm_v128_load64_splat(params->wasmsimd.sixth); const v128_t vthree = wasm_v128_load64_splat(params->wasmsimd.three); const v128_t vsix = wasm_v128_load64_splat(params->wasmsimd.six); const v128_t vzero = wasm_i32x4_const_splat(0); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); v128_t vx89AB = wasm_v128_load(input + 8); v128_t vxCDEF = wasm_v128_load(input + 12); input += 16; v128_t vacc0123 = wasm_f32x4_add(vx0123, vthree); vx0123 = wasm_f32x4_mul(vx0123, vsixth); v128_t vacc4567 = wasm_f32x4_add(vx4567, vthree); vx4567 = wasm_f32x4_mul(vx4567, vsixth); v128_t vacc89AB = wasm_f32x4_add(vx89AB, vthree); vx89AB = wasm_f32x4_mul(vx89AB, vsixth); v128_t vaccCDEF = wasm_f32x4_add(vxCDEF, vthree); vxCDEF = wasm_f32x4_mul(vxCDEF, vsixth); vacc0123 = wasm_i32x4_max(vacc0123, vzero); vacc4567 = wasm_i32x4_max(vacc4567, vzero); vacc89AB = wasm_i32x4_max(vacc89AB, vzero); vaccCDEF = wasm_i32x4_max(vaccCDEF, vzero); vacc0123 = wasm_i32x4_min(vacc0123, vsix); vacc4567 = wasm_i32x4_min(vacc4567, vsix); vacc89AB = wasm_i32x4_min(vacc89AB, vsix); vaccCDEF = wasm_i32x4_min(vaccCDEF, vsix); vacc0123 = wasm_f32x4_mul(vacc0123, vx0123); vacc4567 = wasm_f32x4_mul(vacc4567, vx4567); vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); wasm_v128_store(output + 8, vacc89AB); wasm_v128_store(output + 12, vaccCDEF); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vx = wasm_v128_load(input); input += 4; v128_t vacc = wasm_f32x4_add(vx, vthree); vx = wasm_f32x4_mul(vx, vsixth); vacc = wasm_i32x4_max(vacc, vzero); vacc = wasm_i32x4_min(vacc, vsix); 
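  /*
   * Per lane this kernel computes hardswish, y = (x / 6) * clamp(x + 3, 0, 6).
   * The clamp deliberately uses integer min/max on the raw f32 bits:
   * max-with-zero only needs the sign bit, and non-negative IEEE floats
   * order the same way as signed integers, so wasm_i32x4_max/min produce
   * the float clamp without the NaN-propagation rules of f32x4.min/max.
   */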
vacc = wasm_f32x4_mul(vacc, vx); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vx = wasm_v128_load(input); v128_t vacc = wasm_f32x4_add(vx, vthree); vx = wasm_f32x4_mul(vx, vsixth); vacc = wasm_i32x4_max(vacc, vzero); vacc = wasm_i32x4_min(vacc, vsix); vacc = wasm_f32x4_mul(vacc, vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vlrelu_ukernel__wasmsimd_iminmax_x8( size_t batch, const float* input, float* output, const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vslope = wasm_v128_load64_splat(params->wasmsimd.slope); const v128_t vzero = wasm_i32x4_const_splat(0); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { v128_t vx0123 = wasm_v128_load(input); v128_t vx4567 = wasm_v128_load(input + 4); input += 8; v128_t vacc0123 = wasm_i32x4_max(vx0123, vzero); vx0123 = wasm_i32x4_min(vx0123, vzero); v128_t vacc4567 = wasm_i32x4_max(vx4567, vzero); vx4567 = wasm_i32x4_min(vx4567, vzero); vacc0123 = wasm_f32x4_add(wasm_f32x4_mul(vx0123, vslope), vacc0123); vacc4567 = wasm_f32x4_add(wasm_f32x4_mul(vx4567, vslope), vacc4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vx = wasm_v128_load(input); input += 4; v128_t vacc = wasm_i32x4_max(vx, vzero); vx = wasm_i32x4_min(vx, vzero); vacc = wasm_f32x4_add(wasm_f32x4_mul(vx, vslope), vacc); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vx = wasm_v128_load(input); v128_t vacc = wasm_i32x4_max(vx, vzero); vx = wasm_i32x4_min(vx, vzero); vacc = wasm_f32x4_add(wasm_f32x4_mul(vx, vslope), vacc); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vlrelu_ukernel__wasmsimd_laneselect_x8( size_t batch, const float* input, float* output, const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vslope = wasm_v128_load64_splat(params->wasmsimd.slope); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); const v128_t vx4567 = wasm_v128_load(input + 4); input += 8; v128_t vacc0123 = wasm_f32x4_mul(vx0123, vslope); const v128_t vmask0123 = wasm_i32x4_shr(vx0123, 31); v128_t vacc4567 = wasm_f32x4_mul(vx4567, vslope); const v128_t vmask4567 = wasm_i32x4_shr(vx4567, 31); vacc0123 = wasm_v128_bitselect(vacc0123, vx0123, vmask0123); vacc4567 = wasm_v128_bitselect(vacc4567, vx4567, vmask4567); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); output += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; v128_t vacc = wasm_f32x4_mul(vx, vslope); const v128_t vmask = wasm_i32x4_shr(vx, 31); vacc = wasm_v128_bitselect(vacc, vx, vmask); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vacc = 
wasm_f32x4_mul(vx, vslope); const v128_t vmask = wasm_i32x4_shr(vx, 31); vacc = wasm_v128_bitselect(vacc, vx, vmask); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const v128_t vscale0123 = wasm_v128_load(w); v128_t vacc0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vacc1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vbias0123 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vscale0123, vacc0x0123), vbias0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vscale0123, vacc1x0123), vbias0123); vacc0x0123 = wasm_f32x4_max(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_max(vmin, vacc1x0123); vacc0x0123 = wasm_f32x4_min(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_min(vmax, vacc1x0123); wasm_v128_store(o0, vacc0x0123); o0 += 4; wasm_v128_store(o1, vacc1x0123); o1 += 4; w += 8; } if XNN_UNLIKELY(c != 0) { const v128_t vscale = wasm_v128_load(w); v128_t vacc0 = wasm_v128_load(i0); i0 = (const float*) ((uintptr_t) i0 + c); v128_t vacc1 = wasm_v128_load(i1); i1 = (const float*) ((uintptr_t) i1 + c); const v128_t vbias = wasm_v128_load(w + 4); vacc0 = wasm_f32x4_add(wasm_f32x4_mul(vscale, vacc0), vbias); vacc1 = wasm_f32x4_add(wasm_f32x4_mul(vscale, vacc1), vbias); vacc0 = wasm_f32x4_max(vmin, vacc0); vacc1 = wasm_f32x4_max(vmin, vacc1); vacc0 = wasm_f32x4_min(vmax, vacc0); vacc1 = wasm_f32x4_min(vmax, vacc1); if (c & (2 * sizeof(float))) { wasm_v128_store64_lane(o0, vacc0, 0); wasm_v128_store64_lane(o1, vacc1, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); vacc1 = wasm_v64x2_shuffle(vacc1, vacc1, 1, 1); o0 += 2; o1 += 2; } if (c & (1 * sizeof(float))) { wasm_v128_store32_lane(o0, vacc0, 0); o0 += 1; wasm_v128_store32_lane(o1, vacc1, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x( size_t rows, size_t channels, const float* restrict input, size_t input_stride, const float* restrict weights, float* restrict output, size_t output_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(channels != 0); assert(channels % sizeof(float) == 0); const float* i0 = input; float* o0 = output; const float* i1 = (const float*) 
((uintptr_t) i0 + input_stride); float* o1 = (float*) ((uintptr_t) o0 + output_stride); const size_t input_increment = input_stride * 2 - channels; const size_t output_increment = output_stride * 2 - channels; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); do { if XNN_UNPREDICTABLE(rows < 2) { i1 = i0; o1 = o0; } const float* w = weights; size_t c = channels; for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) { const v128_t vscale0123 = wasm_v128_load(w); v128_t vacc0x0123 = wasm_v128_load(i0); i0 += 4; v128_t vacc1x0123 = wasm_v128_load(i1); i1 += 4; const v128_t vbias0123 = wasm_v128_load(w + 4); vacc0x0123 = wasm_f32x4_add(wasm_f32x4_mul(vscale0123, vacc0x0123), vbias0123); vacc1x0123 = wasm_f32x4_add(wasm_f32x4_mul(vscale0123, vacc1x0123), vbias0123); vacc0x0123 = wasm_f32x4_pmax(vmin, vacc0x0123); vacc1x0123 = wasm_f32x4_pmax(vmin, vacc1x0123); vacc0x0123 = wasm_f32x4_pmin(vmax, vacc0x0123); vacc1x0123 = wasm_f32x4_pmin(vmax, vacc1x0123); wasm_v128_store(o0, vacc0x0123); o0 += 4; wasm_v128_store(o1, vacc1x0123); o1 += 4; w += 8; } if XNN_UNLIKELY(c != 0) { const v128_t vscale = wasm_v128_load(w); v128_t vacc0 = wasm_v128_load(i0); i0 = (const float*) ((uintptr_t) i0 + c); v128_t vacc1 = wasm_v128_load(i1); i1 = (const float*) ((uintptr_t) i1 + c); const v128_t vbias = wasm_v128_load(w + 4); vacc0 = wasm_f32x4_add(wasm_f32x4_mul(vscale, vacc0), vbias); vacc1 = wasm_f32x4_add(wasm_f32x4_mul(vscale, vacc1), vbias); vacc0 = wasm_f32x4_pmax(vmin, vacc0); vacc1 = wasm_f32x4_pmax(vmin, vacc1); vacc0 = wasm_f32x4_pmin(vmax, vacc0); vacc1 = wasm_f32x4_pmin(vmax, vacc1); if (c & (2 * sizeof(float))) { wasm_v128_store64_lane(o0, vacc0, 0); wasm_v128_store64_lane(o1, vacc1, 0); vacc0 = wasm_v64x2_shuffle(vacc0, vacc0, 1, 1); vacc1 = wasm_v64x2_shuffle(vacc1, vacc1, 1, 1); o0 += 2; o1 += 2; } if (c & (1 * sizeof(float))) { wasm_v128_store32_lane(o0, vacc0, 0); o0 += 1; wasm_v128_store32_lane(o1, vacc1, 0); o1 += 1; } } i0 = (const float*) ((uintptr_t) i0 + input_increment); o0 = (float*) ((uintptr_t) o0 + output_increment); i1 = (const float*) ((uintptr_t) i1 + input_increment); o1 = (float*) ((uintptr_t) o1 + output_increment); rows = doz(rows, 2); } while (rows != 0); } void xnn_f32_vrelu_ukernel__wasmsimd_x16( size_t batch, const float* input, float* output, const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vzero = wasm_i32x4_const_splat(0); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { v128_t vacc0123 = wasm_v128_load(input); v128_t vacc4567 = wasm_v128_load(input + 4); v128_t vacc89AB = wasm_v128_load(input + 8); v128_t vaccCDEF = wasm_v128_load(input + 12); input += 16; vacc0123 = wasm_i32x4_max(vacc0123, vzero); vacc4567 = wasm_i32x4_max(vacc4567, vzero); vacc89AB = wasm_i32x4_max(vacc89AB, vzero); vaccCDEF = wasm_i32x4_max(vaccCDEF, vzero); wasm_v128_store(output, vacc0123); wasm_v128_store(output + 4, vacc4567); wasm_v128_store(output + 8, vacc89AB); wasm_v128_store(output + 12, vaccCDEF); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { v128_t vacc = wasm_v128_load(input); input += 4; vacc = wasm_i32x4_max(vacc, vzero); wasm_v128_store(output, vacc); output += 4; } if XNN_UNLIKELY(batch != 0) { v128_t vacc = wasm_v128_load(input); vacc = wasm_i32x4_max(vacc, vzero); if (batch & (2 * sizeof(float))) 
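  /*
   * Partial-store tail, shared by every vector kernel in this file: a full
   * vector was loaded above (the XNN_OOB_READS annotation permits reading
   * past the last 1..3 valid floats), so only the valid lanes are written
   * back. When bit (2 * sizeof(float)) of batch is set, lanes 0-1 go out
   * through a 64-bit store and a shuffle rotates lanes 2-3 down; a final
   * odd element is handled by a 32-bit store of lane 0.
   */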
{ wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vacc, 0); } } } void xnn_f32_vrndd_ukernel__wasmsimd_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); input += 4; const v128_t vx4567 = wasm_v128_load(input); input += 4; const v128_t vy0123 = wasm_f32x4_floor(vx0123); const v128_t vy4567 = wasm_f32x4_floor(vx4567); wasm_v128_store(output, vy0123); output += 4; wasm_v128_store(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vy = wasm_f32x4_floor(vx); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vy = wasm_f32x4_floor(vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrndne_ukernel__wasmsimd_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); input += 4; const v128_t vx4567 = wasm_v128_load(input); input += 4; const v128_t vy0123 = wasm_f32x4_nearest(vx0123); const v128_t vy4567 = wasm_f32x4_nearest(vx4567); wasm_v128_store(output, vy0123); output += 4; wasm_v128_store(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vy = wasm_f32x4_nearest(vx); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vy = wasm_f32x4_nearest(vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrndu_ukernel__wasmsimd_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); input += 4; const v128_t vx4567 = wasm_v128_load(input); input += 4; const v128_t vy0123 = wasm_f32x4_ceil(vx0123); const v128_t vy4567 = wasm_f32x4_ceil(vx4567); wasm_v128_store(output, vy0123); output += 4; wasm_v128_store(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vy = wasm_f32x4_ceil(vx); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vy = wasm_f32x4_ceil(vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = 
wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vrndz_ukernel__wasmsimd_x8( size_t batch, const float* input, float* output, const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); input += 4; const v128_t vx4567 = wasm_v128_load(input); input += 4; const v128_t vy0123 = wasm_f32x4_trunc(vx0123); const v128_t vy4567 = wasm_f32x4_trunc(vx4567); wasm_v128_store(output, vy0123); output += 4; wasm_v128_store(output, vy4567); output += 4; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vy = wasm_f32x4_trunc(vx); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vy = wasm_f32x4_trunc(vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16( size_t batch, const float* input, float* output, const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.magic_bias); const v128_t vminus_log2e = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.minus_log2e); const v128_t vln2_hi = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.ln2_hi); const v128_t vln2_lo = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.ln2_lo); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c2); const v128_t vc1 = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.c1); const v128_t vone = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.one); const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_rr2_p5.denorm_cutoff); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); const v128_t vx4567 = wasm_v128_load(input + 4); const v128_t vx89AB = wasm_v128_load(input + 8); const v128_t vxCDEF = wasm_v128_load(input + 12); input += 16; const v128_t vz0123 = wasm_f32x4_abs(vx0123); const v128_t vz4567 = wasm_f32x4_abs(vx4567); const v128_t vz89AB = wasm_f32x4_abs(vx89AB); const v128_t vzCDEF = wasm_f32x4_abs(vxCDEF); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vminus_log2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vminus_log2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vz89AB, vminus_log2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vzCDEF, vminus_log2e), vmagic_bias); const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); vn89AB = 
wasm_f32x4_sub(vn89AB, vmagic_bias); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vln2_hi), vz0123); v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vln2_hi), vz4567); v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vln2_hi), vz89AB); v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vln2_hi), vzCDEF); vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vln2_lo), vt0123); vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vln2_lo), vt4567); vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vln2_lo), vt89AB); vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vln2_lo), vtCDEF); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vc5), vc4); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vc5), vc4); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vc5), vc4); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vc5), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vc3); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vc3); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vc2); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vc1); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vc1); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vc1); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vc1); vt0123 = wasm_f32x4_mul(vt0123, vs0123); vt4567 = wasm_f32x4_mul(vt4567, vs4567); vt89AB = wasm_f32x4_mul(vt89AB, vs89AB); vtCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); const v128_t ve0123 = wasm_f32x4_add(wasm_f32x4_mul(vt0123, vp0123), vs0123); const v128_t ve4567 = wasm_f32x4_add(wasm_f32x4_mul(vt4567, vp4567), vs4567); const v128_t ve89AB = wasm_f32x4_add(wasm_f32x4_mul(vt89AB, vp89AB), vs89AB); const v128_t veCDEF = wasm_f32x4_add(wasm_f32x4_mul(vtCDEF, vpCDEF), vsCDEF); const v128_t vd0123 = wasm_f32x4_add(ve0123, vone); const v128_t vd4567 = wasm_f32x4_add(ve4567, vone); const v128_t vd89AB = wasm_f32x4_add(ve89AB, vone); const v128_t vdCDEF = wasm_f32x4_add(veCDEF, vone); v128_t vf0123 = wasm_f32x4_div(ve0123, vd0123); v128_t vf4567 = wasm_f32x4_div(ve4567, vd4567); v128_t vf89AB = wasm_f32x4_div(ve89AB, vd89AB); v128_t vfCDEF = wasm_f32x4_div(veCDEF, vdCDEF); vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_gt(vz0123, vdenorm_cutoff)); vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_gt(vz4567, vdenorm_cutoff)); vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_gt(vz89AB, vdenorm_cutoff)); vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_gt(vzCDEF, vdenorm_cutoff)); const v128_t vcf0123 = wasm_f32x4_sub(vone, vf0123); const v128_t vcf4567 = wasm_f32x4_sub(vone, vf4567); const v128_t vcf89AB = wasm_f32x4_sub(vone, vf89AB); const v128_t vcfCDEF = wasm_f32x4_sub(vone, vfCDEF); vf0123 = wasm_v128_bitselect(vf0123, vcf0123, wasm_i32x4_shr(vx0123, 31)); vf4567 = wasm_v128_bitselect(vf4567, vcf4567, wasm_i32x4_shr(vx4567, 31)); vf89AB = wasm_v128_bitselect(vf89AB, vcf89AB, wasm_i32x4_shr(vx89AB, 31)); vfCDEF = wasm_v128_bitselect(vfCDEF, vcfCDEF, wasm_i32x4_shr(vxCDEF, 31)); wasm_v128_store(output, vf0123); wasm_v128_store(output + 4, vf4567); wasm_v128_store(output + 8, vf89AB); wasm_v128_store(output + 12, vfCDEF); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) 
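  /*
   * One vector per iteration. Per lane, sigmoid(x) is evaluated as follows:
   * with z = |x|, e ~= exp(-z) is computed by the same magic-bias rounding,
   * a two-step ln2 reduction (hi + lo) and a degree-5 polynomial, then
   * f = e / (1 + e), which equals sigmoid(-z). Where z > denorm_cutoff the
   * quotient is flushed to exactly zero (exp(-z) underflows there), and the
   * sign bit of x selects f for negative x or 1 - f for non-negative x.
   */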
{ const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vz = wasm_f32x4_abs(vx); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vminus_log2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vc5), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc1); vt = wasm_f32x4_mul(vt, vs); const v128_t ve = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); const v128_t vd = wasm_f32x4_add(ve, vone); v128_t vf = wasm_f32x4_div(ve, vd); vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff)); const v128_t vcf = wasm_f32x4_sub(vone, vf); vf = wasm_v128_bitselect(vf, vcf, wasm_i32x4_shr(vx, 31)); wasm_v128_store(output, vf); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); const v128_t vz = wasm_f32x4_abs(vx); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vminus_log2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2_hi), vz); vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2_lo), vt); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vc5), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vc1); vt = wasm_f32x4_mul(vt, vs); const v128_t ve = wasm_f32x4_add(wasm_f32x4_mul(vt, vp), vs); const v128_t vd = wasm_f32x4_add(ve, vone); v128_t vf = wasm_f32x4_div(ve, vd); vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff)); const v128_t vcf = wasm_f32x4_sub(vone, vf); vf = wasm_v128_bitselect(vf, vcf, wasm_i32x4_shr(vx, 31)); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vf, 0); vf = wasm_v64x2_shuffle(vf, vf, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vf, 0); } } } void xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); const v128_t vx4567 = wasm_v128_load(input + 4); input += 8; const v128_t vy0123 = wasm_f32x4_sqrt(vx0123); const v128_t vy4567 = wasm_f32x4_sqrt(vx4567); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); output += 8; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; const v128_t vy = wasm_f32x4_sqrt(vx); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vy = wasm_f32x4_sqrt(vx); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vtanh_ukernel__wasmsimd_expm1minus_rr1_p6h5ts_div_abs_min_x16( size_t batch, const float* input, float* output, const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const 
v128_t vsat_cutoff = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.sat_cutoff); const v128_t vminus_log2e = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.minus_log2e); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.magic_bias); const v128_t vln2 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.ln2); const v128_t vc6 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.c6); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.c2); const v128_t vminus_two = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.minus_two); const v128_t vone = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.one); const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_abs.sign_mask); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); const v128_t vx4567 = wasm_v128_load(input + 4); const v128_t vx89AB = wasm_v128_load(input + 8); const v128_t vxCDEF = wasm_v128_load(input + 12); input += 16; v128_t vz0123 = wasm_f32x4_abs(vx0123); v128_t vz4567 = wasm_f32x4_abs(vx4567); v128_t vz89AB = wasm_f32x4_abs(vx89AB); v128_t vzCDEF = wasm_f32x4_abs(vxCDEF); vz0123 = wasm_f32x4_min(vz0123, vsat_cutoff); vz4567 = wasm_f32x4_min(vz4567, vsat_cutoff); vz89AB = wasm_f32x4_min(vz89AB, vsat_cutoff); vzCDEF = wasm_f32x4_min(vzCDEF, vsat_cutoff); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vminus_log2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vminus_log2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vz89AB, vminus_log2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vzCDEF, vminus_log2e), vmagic_bias); const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias); const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); const v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vln2), vz0123); const v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vln2), vz4567); const v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vln2), vz89AB); const v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vln2), vzCDEF); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt0123), vc5); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt4567), vc5); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt89AB), vc5); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtCDEF), vc5); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc4); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc4); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc4); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); vp0123 = 
wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vminus_two); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vminus_two); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vminus_two); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vminus_two); const v128_t vts0123 = wasm_f32x4_mul(vt0123, vs0123); const v128_t vsmo0123 = wasm_f32x4_sub(vs0123, vone); const v128_t vts4567 = wasm_f32x4_mul(vt4567, vs4567); const v128_t vsmo4567 = wasm_f32x4_sub(vs4567, vone); const v128_t vts89AB = wasm_f32x4_mul(vt89AB, vs89AB); const v128_t vsmo89AB = wasm_f32x4_sub(vs89AB, vone); const v128_t vtsCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); const v128_t vsmoCDEF = wasm_f32x4_sub(vsCDEF, vone); const v128_t vemo0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vts0123), vsmo0123); const v128_t vemo4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vts4567), vsmo4567); const v128_t vemo89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vts89AB), vsmo89AB); const v128_t vemoCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtsCDEF), vsmoCDEF); const v128_t vepo0123 = wasm_f32x4_sub(vemo0123, vminus_two); const v128_t vepo4567 = wasm_f32x4_sub(vemo4567, vminus_two); const v128_t vepo89AB = wasm_f32x4_sub(vemo89AB, vminus_two); const v128_t vepoCDEF = wasm_f32x4_sub(vemoCDEF, vminus_two); v128_t vy0123 = wasm_f32x4_div(vemo0123, vepo0123); v128_t vy4567 = wasm_f32x4_div(vemo4567, vepo4567); v128_t vy89AB = wasm_f32x4_div(vemo89AB, vepo89AB); v128_t vyCDEF = wasm_f32x4_div(vemoCDEF, vepoCDEF); vy0123 = wasm_v128_bitselect(vx0123, vy0123, vsign_mask); vy4567 = wasm_v128_bitselect(vx4567, vy4567, vsign_mask); vy89AB = wasm_v128_bitselect(vx89AB, vy89AB, vsign_mask); vyCDEF = wasm_v128_bitselect(vxCDEF, vyCDEF, vsign_mask); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; v128_t vz = wasm_f32x4_abs(vx); vz = wasm_f32x4_min(vz, vsat_cutoff); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vminus_log2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2), vz); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vminus_two); const v128_t vts = wasm_f32x4_mul(vt, vs); const v128_t vsmo = wasm_f32x4_sub(vs, vone); const v128_t vemo = wasm_f32x4_add(wasm_f32x4_mul(vp, vts), vsmo); const v128_t vepo = wasm_f32x4_sub(vemo, vminus_two); v128_t vy = wasm_f32x4_div(vemo, vepo); vy = wasm_v128_bitselect(vx, vy, vsign_mask); wasm_v128_store(output, vy); output += 4; } if XNN_UNLIKELY(batch != 0) { const v128_t vx = wasm_v128_load(input); v128_t vz = wasm_f32x4_abs(vx); vz = wasm_f32x4_min(vz, vsat_cutoff); v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vminus_log2e), vmagic_bias); const v128_t vs = wasm_i32x4_shl(vn, 23); vn = wasm_f32x4_sub(vn, vmagic_bias); const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vln2), vz); v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, 
vt), vc5); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2); vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vminus_two); const v128_t vts = wasm_f32x4_mul(vt, vs); const v128_t vsmo = wasm_f32x4_sub(vs, vone); const v128_t vemo = wasm_f32x4_add(wasm_f32x4_mul(vp, vts), vsmo); const v128_t vepo = wasm_f32x4_sub(vemo, vminus_two); v128_t vy = wasm_f32x4_div(vemo, vepo); vy = wasm_v128_bitselect(vx, vy, vsign_mask); if (batch & (2 * sizeof(float))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(float))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_f32_vtanh_ukernel__wasmsimd_expm1minus_rr1_p6h5ts_div_nabs_pmax_x16( size_t batch, const float* input, float* output, const union xnn_f32_tanh_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.sign_mask); const v128_t vsat_cutoff = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.sat_cutoff); const v128_t vlog2e = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.log2e); const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.magic_bias); const v128_t vminus_ln2 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.minus_ln2); const v128_t vc6 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.c6); const v128_t vc5 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.c5); const v128_t vc4 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.c4); const v128_t vc3 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.c3); const v128_t vc2 = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.c2); const v128_t vtwo = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.two); const v128_t vone = wasm_v128_load64_splat(params->wasmsimd_expm1minus_rr1_p6h5_nabs.one); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const v128_t vx0123 = wasm_v128_load(input); const v128_t vx4567 = wasm_v128_load(input + 4); const v128_t vx89AB = wasm_v128_load(input + 8); const v128_t vxCDEF = wasm_v128_load(input + 12); input += 16; v128_t vz0123 = wasm_v128_or(vx0123, vsign_mask); v128_t vz4567 = wasm_v128_or(vx4567, vsign_mask); v128_t vz89AB = wasm_v128_or(vx89AB, vsign_mask); v128_t vzCDEF = wasm_v128_or(vxCDEF, vsign_mask); const v128_t vinvsignx0123 = wasm_v128_xor(vx0123, vz0123); vz0123 = wasm_f32x4_pmax(vz0123, vsat_cutoff); const v128_t vinvsignx4567 = wasm_v128_xor(vx4567, vz4567); vz4567 = wasm_f32x4_pmax(vz4567, vsat_cutoff); const v128_t vinvsignx89AB = wasm_v128_xor(vx89AB, vz89AB); vz89AB = wasm_f32x4_pmax(vz89AB, vsat_cutoff); const v128_t vinvsignxCDEF = wasm_v128_xor(vxCDEF, vzCDEF); vzCDEF = wasm_f32x4_pmax(vzCDEF, vsat_cutoff); v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vlog2e), vmagic_bias); v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vlog2e), vmagic_bias); v128_t vn89AB = wasm_f32x4_add(wasm_f32x4_mul(vz89AB, vlog2e), vmagic_bias); v128_t vnCDEF = wasm_f32x4_add(wasm_f32x4_mul(vzCDEF, vlog2e), vmagic_bias); const v128_t vs0123 = wasm_i32x4_shl(vn0123, 23); vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias); const v128_t vs4567 = wasm_i32x4_shl(vn4567, 23); vn4567 = 
wasm_f32x4_sub(vn4567, vmagic_bias); const v128_t vs89AB = wasm_i32x4_shl(vn89AB, 23); vn89AB = wasm_f32x4_sub(vn89AB, vmagic_bias); const v128_t vsCDEF = wasm_i32x4_shl(vnCDEF, 23); vnCDEF = wasm_f32x4_sub(vnCDEF, vmagic_bias); const v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2), vz0123); const v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2), vz4567); const v128_t vt89AB = wasm_f32x4_add(wasm_f32x4_mul(vn89AB, vminus_ln2), vz89AB); const v128_t vtCDEF = wasm_f32x4_add(wasm_f32x4_mul(vnCDEF, vminus_ln2), vzCDEF); v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt0123), vc5); v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt4567), vc5); v128_t vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt89AB), vc5); v128_t vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vc6, vtCDEF), vc5); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc4); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc4); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc4); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc4); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc3); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc3); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc3); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc3); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vc2); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vc2); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vc2); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vc2); vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vtwo); vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vtwo); vp89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vt89AB), vtwo); vpCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtCDEF), vtwo); const v128_t vts0123 = wasm_f32x4_mul(vt0123, vs0123); const v128_t vsmo0123 = wasm_f32x4_sub(vs0123, vone); const v128_t vts4567 = wasm_f32x4_mul(vt4567, vs4567); const v128_t vsmo4567 = wasm_f32x4_sub(vs4567, vone); const v128_t vts89AB = wasm_f32x4_mul(vt89AB, vs89AB); const v128_t vsmo89AB = wasm_f32x4_sub(vs89AB, vone); const v128_t vtsCDEF = wasm_f32x4_mul(vtCDEF, vsCDEF); const v128_t vsmoCDEF = wasm_f32x4_sub(vsCDEF, vone); const v128_t vemo0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vts0123), vsmo0123); const v128_t vemo4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vts4567), vsmo4567); const v128_t vemo89AB = wasm_f32x4_add(wasm_f32x4_mul(vp89AB, vts89AB), vsmo89AB); const v128_t vemoCDEF = wasm_f32x4_add(wasm_f32x4_mul(vpCDEF, vtsCDEF), vsmoCDEF); const v128_t vepo0123 = wasm_f32x4_add(vemo0123, vtwo); const v128_t vepo4567 = wasm_f32x4_add(vemo4567, vtwo); const v128_t vepo89AB = wasm_f32x4_add(vemo89AB, vtwo); const v128_t vepoCDEF = wasm_f32x4_add(vemoCDEF, vtwo); v128_t vy0123 = wasm_f32x4_div(vemo0123, vepo0123); v128_t vy4567 = wasm_f32x4_div(vemo4567, vepo4567); v128_t vy89AB = wasm_f32x4_div(vemo89AB, vepo89AB); v128_t vyCDEF = wasm_f32x4_div(vemoCDEF, vepoCDEF); vy0123 = wasm_v128_xor(vy0123, vinvsignx0123); vy4567 = wasm_v128_xor(vy4567, vinvsignx4567); vy89AB = wasm_v128_xor(vy89AB, vinvsignx89AB); vyCDEF = wasm_v128_xor(vyCDEF, vinvsignxCDEF); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); output += 16; } for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const v128_t vx = wasm_v128_load(input); input += 4; v128_t vz = wasm_v128_or(vx, vsign_mask); const v128_t vinvsignx = 
wasm_v128_xor(vx, vz);
    vz = wasm_f32x4_pmax(vz, vsat_cutoff);
    v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);
    const v128_t vs = wasm_i32x4_shl(vn, 23);
    vn = wasm_f32x4_sub(vn, vmagic_bias);
    const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2), vz);
    v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vtwo);
    const v128_t vts = wasm_f32x4_mul(vt, vs);
    const v128_t vsmo = wasm_f32x4_sub(vs, vone);
    const v128_t vemo = wasm_f32x4_add(wasm_f32x4_mul(vp, vts), vsmo);
    const v128_t vepo = wasm_f32x4_add(vemo, vtwo);
    v128_t vy = wasm_f32x4_div(vemo, vepo);
    vy = wasm_v128_xor(vy, vinvsignx);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t vx = wasm_v128_load(input);
    v128_t vz = wasm_v128_or(vx, vsign_mask);
    const v128_t vinvsignx = wasm_v128_xor(vx, vz);
    vz = wasm_f32x4_pmax(vz, vsat_cutoff);
    v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);
    const v128_t vs = wasm_i32x4_shl(vn, 23);
    vn = wasm_f32x4_sub(vn, vmagic_bias);
    const v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2), vz);
    v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc6, vt), vc5);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc4);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc3);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vc2);
    vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vtwo);
    const v128_t vts = wasm_f32x4_mul(vt, vs);
    const v128_t vsmo = wasm_f32x4_sub(vs, vone);
    const v128_t vemo = wasm_f32x4_add(wasm_f32x4_mul(vp, vts), vsmo);
    const v128_t vepo = wasm_f32x4_add(vemo, vtwo);
    v128_t vy = wasm_f32x4_div(vemo, vepo);
    vy = wasm_v128_xor(vy, vinvsignx);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vabs_ukernel__wasmsimd_x8(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t vnonsign_mask = wasm_v128_load64_splat(&params->wasmsimd.nonsign_mask);
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const v128_t vx0123 = wasm_v128_load(input);
    const v128_t vx4567 = wasm_v128_load(input + 4);
    input += 8;
    const v128_t vy0123 = wasm_v128_and(vx0123, vnonsign_mask);
    const v128_t vy4567 = wasm_v128_and(vx4567, vnonsign_mask);
    wasm_v128_store(output, vy0123);
    wasm_v128_store(output + 4, vy4567);
    output += 8;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vx = wasm_v128_load(input);
    input += 4;
    const v128_t vy = wasm_v128_and(vx, vnonsign_mask);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t vx = wasm_v128_load(input);
    v128_t vy = wasm_v128_and(vx, vnonsign_mask);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vneg_ukernel__wasmsimd_x8(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
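  /*
   * Like vabs above, which ANDs with the complement of the sign bit,
   * negation is a pure bitwise operation: XOR with a splat of 0x80000000
   * flips the sign of every f32 lane, so this kernel is exact for all
   * inputs, including infinities and NaN payloads.
   */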
assert(output != NULL);

  const v128_t vsign_mask = wasm_v128_load64_splat(&params->wasmsimd.sign_mask);
  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const v128_t vx0123 = wasm_v128_load(input);
    const v128_t vx4567 = wasm_v128_load(input + 4);
    input += 8;
    const v128_t vy0123 = wasm_v128_xor(vx0123, vsign_mask);
    const v128_t vy4567 = wasm_v128_xor(vx4567, vsign_mask);
    wasm_v128_store(output, vy0123);
    wasm_v128_store(output + 4, vy4567);
    output += 8;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vx = wasm_v128_load(input);
    input += 4;
    const v128_t vy = wasm_v128_xor(vx, vsign_mask);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t vx = wasm_v128_load(input);
    v128_t vy = wasm_v128_xor(vx, vsign_mask);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_f32_vsqr_ukernel__wasmsimd_x8(
    size_t batch,
    const float* input,
    float* output,
    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(float) == 0);
  assert(input != NULL);
  assert(output != NULL);

  for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
    const v128_t vx0123 = wasm_v128_load(input);
    const v128_t vx4567 = wasm_v128_load(input + 4);
    input += 8;
    const v128_t vy0123 = wasm_f32x4_mul(vx0123, vx0123);
    const v128_t vy4567 = wasm_f32x4_mul(vx4567, vx4567);
    wasm_v128_store(output, vy0123);
    wasm_v128_store(output + 4, vy4567);
    output += 8;
  }
  for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
    const v128_t vx = wasm_v128_load(input);
    input += 4;
    const v128_t vy = wasm_f32x4_mul(vx, vx);
    wasm_v128_store(output, vy);
    output += 4;
  }
  if XNN_UNLIKELY(batch != 0) {
    const v128_t vx = wasm_v128_load(input);
    v128_t vy = wasm_f32x4_mul(vx, vx);
    if (batch & (2 * sizeof(float))) {
      wasm_v128_store64_lane(output, vy, 0);
      vy = wasm_v64x2_shuffle(vy, vy, 1, 1);
      output += 2;
    }
    if (batch & (1 * sizeof(float))) {
      wasm_v128_store32_lane(output, vy, 0);
    }
  }
}

void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const int8_t* a0 = a;
  float* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    const v128_t vksum0123 = wasm_v128_load(w);
    const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point);
    v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0);
    w = (const int32_t*) w + 4;
    size_t k = kc;
    do {
      v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0);
      a0 += 8;
      const v128_t vb01 = wasm_v128_load(w);
      const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01);
      const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      const v128_t
vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); const v128_t vfilter_output_scale0123 = wasm_v128_load(w); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); w = (const float*) w + 4; const v128_t vbias0123 = wasm_v128_load(w); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); w = (const float*) w + 4; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); if XNN_LIKELY(nc >= 4) { wasm_v128_store(c0, vacc0x0123); a0 = (const int8_t*) ((uintptr_t) a0 - kc); c0 = (float*) ((uintptr_t) c0 + cn_stride); nc -= 4; } else { if (nc & 2) { wasm_v128_store64_lane(c0, vacc0x0123, 0); vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); c0 += 2; } if (nc & 1) { wasm_v128_store32_lane(c0, vacc0x0123, 0); } nc = 0; } } while (nc != 0); } void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, float* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; float* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); float* c1 = (float*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); float* c2 = (float*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); float* c3 = (float*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { const v128_t vksum0123 = wasm_v128_load(w); const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point); const v128_t vinput_zero_point1 = wasm_v128_load32_splat(&quantization_params[1].zero_point); const v128_t vinput_zero_point2 = wasm_v128_load32_splat(&quantization_params[2].zero_point); const v128_t vinput_zero_point3 = wasm_v128_load32_splat(&quantization_params[3].zero_point); v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0); v128_t vacc1x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point1); v128_t vacc2x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point2); v128_t vacc3x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point3); w = (const int32_t*) w + 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0); a0 += 8; v128_t vxa1 = wasm_i16x8_load8x8((const v128_t*) a1); a1 += 8; v128_t vxa2 = 
wasm_i16x8_load8x8((const v128_t*) a2); a2 += 8; v128_t vxa3 = wasm_i16x8_load8x8((const v128_t*) a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3); const v128_t vfilter_output_scale0123 = wasm_v128_load(w); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123); w = (const float*) w + 4; const v128_t vbias0123 = wasm_v128_load(w); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); 
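    /*
     * Epilogue; the bias adds continue for rows 1-3 below. The int32
     * dot-product accumulators have been converted to f32 and scaled by
     * each row's dynamic input scale (quantization_params[m].inv_scale)
     * and by the per-column filter scales packed into the weight stream.
     * After the bias add, the results are clamped to [min, max] and stored
     * 4, 2, or 1 columns at a time.
     */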
vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123);
    vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123);
    vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123);
    w = (const float*) w + 4;

    // Clamp the results to the requested output range.
    const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
    vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
    vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin);
    vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin);
    vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin);
    const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
    vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);
    vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax);
    vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax);
    vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax);

    if XNN_LIKELY(nc >= 4) {
      wasm_v128_store(c3, vacc3x0123);
      wasm_v128_store(c2, vacc2x0123);
      wasm_v128_store(c1, vacc1x0123);
      wasm_v128_store(c0, vacc0x0123);

      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);

      c0 = (float*) ((uintptr_t) c0 + cn_stride);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);

      nc -= 4;
    } else {
      // Partial-width stores for the last 1-3 output columns.
      if (nc & 2) {
        wasm_v128_store64_lane(c3, vacc3x0123, 0);
        vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1);
        c3 += 2;
        wasm_v128_store64_lane(c2, vacc2x0123, 0);
        vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1);
        c2 += 2;
        wasm_v128_store64_lane(c1, vacc1x0123, 0);
        vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1);
        c1 += 2;
        wasm_v128_store64_lane(c0, vacc0x0123, 0);
        vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
        c0 += 2;
      }
      if (nc & 1) {
        wasm_v128_store32_lane(c3, vacc3x0123, 0);
        wasm_v128_store32_lane(c2, vacc2x0123, 0);
        wasm_v128_store32_lane(c1, vacc1x0123, 0);
        wasm_v128_store32_lane(c0, vacc0x0123, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

// Converts QS16 inputs to QS8 outputs: each int16 value is sign-extended to
// int32, multiplied by a 32-bit multiplier into a 64-bit product, offset by a
// rounding bias, shifted right by 16, and narrowed back down to int8.
void xnn_qs16_qs8_vcvt_ukernel__wasmsimd_x16(
    size_t batch,
    const int16_t* input,
    int8_t* output,
    const union xnn_qs16_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(int16_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
  const v128_t vbias = wasm_v128_load64_splat(&params->wasmsimd.bias);
  for (; batch >= 16 * sizeof(int16_t); batch -= 16 * sizeof(int16_t)) {
    const v128_t vx0 = wasm_i32x4_load16x4(input); input += 4;
    const v128_t vx1 = wasm_i32x4_load16x4(input); input += 4;
    const v128_t vx2 = wasm_i32x4_load16x4(input); input += 4;
    const v128_t vx3 = wasm_i32x4_load16x4(input); input += 4;

    v128_t vacc0lo = wasm_i64x2_extmul_low_i32x4(vx0, vmultiplier);
    v128_t vacc0hi = wasm_i64x2_extmul_high_i32x4(vx0, vmultiplier);
    v128_t vacc1lo = wasm_i64x2_extmul_low_i32x4(vx1, vmultiplier);
    v128_t vacc1hi = wasm_i64x2_extmul_high_i32x4(vx1, vmultiplier);
    v128_t vacc2lo = wasm_i64x2_extmul_low_i32x4(vx2, vmultiplier);
    v128_t vacc2hi = wasm_i64x2_extmul_high_i32x4(vx2, vmultiplier);
    v128_t vacc3lo = wasm_i64x2_extmul_low_i32x4(vx3, vmultiplier);
    v128_t vacc3hi = wasm_i64x2_extmul_high_i32x4(vx3, vmultiplier);

    vacc0lo = wasm_i64x2_add(vacc0lo, vbias);
    vacc0hi = wasm_i64x2_add(vacc0hi, vbias);
    vacc1lo = wasm_i64x2_add(vacc1lo, vbias);
    vacc1hi = wasm_i64x2_add(vacc1hi, vbias);
    vacc2lo = wasm_i64x2_add(vacc2lo, vbias);
    vacc2hi = wasm_i64x2_add(vacc2hi, vbias);
    vacc3lo = wasm_i64x2_add(vacc3lo, vbias);
    vacc3hi = wasm_i64x2_add(vacc3hi, vbias);

    vacc0lo = wasm_i64x2_shr(vacc0lo,
16); vacc0hi = wasm_i64x2_shr(vacc0hi, 16); vacc1lo = wasm_i64x2_shr(vacc1lo, 16); vacc1hi = wasm_i64x2_shr(vacc1hi, 16); vacc2lo = wasm_i64x2_shr(vacc2lo, 16); vacc2hi = wasm_i64x2_shr(vacc2hi, 16); vacc3lo = wasm_i64x2_shr(vacc3lo, 16); vacc3hi = wasm_i64x2_shr(vacc3hi, 16); v128_t vacc0 = wasm_v32x4_shuffle(vacc0lo, vacc0hi, 0, 2, 4, 6); v128_t vacc1 = wasm_v32x4_shuffle(vacc1lo, vacc1hi, 0, 2, 4, 6); v128_t vacc2 = wasm_v32x4_shuffle(vacc2lo, vacc2hi, 0, 2, 4, 6); v128_t vacc3 = wasm_v32x4_shuffle(vacc3lo, vacc3hi, 0, 2, 4, 6); vacc0 = wasm_i16x8_narrow_i32x4(vacc0, vacc0); vacc1 = wasm_i16x8_narrow_i32x4(vacc1, vacc1); vacc2 = wasm_i16x8_narrow_i32x4(vacc2, vacc2); vacc3 = wasm_i16x8_narrow_i32x4(vacc3, vacc3); const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc0); const v128_t vy1 = wasm_i8x16_narrow_i16x8(vacc1, vacc1); const v128_t vy2 = wasm_i8x16_narrow_i16x8(vacc2, vacc2); const v128_t vy3 = wasm_i8x16_narrow_i16x8(vacc3, vacc3); wasm_v128_store32_lane(output, vy0, 0); output += 4; wasm_v128_store32_lane(output, vy1, 0); output += 4; wasm_v128_store32_lane(output, vy2, 0); output += 4; wasm_v128_store32_lane(output, vy3, 0); output += 4; } for (; batch >= 4 * sizeof(int16_t); batch -= 4 * sizeof(int16_t)) { const v128_t vx = wasm_i32x4_load16x4(input); input += 4; v128_t vacclo = wasm_i64x2_extmul_low_i32x4(vx, vmultiplier); v128_t vacchi = wasm_i64x2_extmul_high_i32x4(vx, vmultiplier); vacclo = wasm_i64x2_add(vacclo, vbias); vacchi = wasm_i64x2_add(vacchi, vbias); vacclo = wasm_i64x2_shr(vacclo, 16); vacchi = wasm_i64x2_shr(vacchi, 16); v128_t vacc = wasm_v32x4_shuffle(vacclo, vacchi, 0, 2, 4, 6); vacc = wasm_i16x8_narrow_i32x4(vacc, vacc); const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); wasm_v128_store32_lane(output, vy, 0); output += 4; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int16_t)); assert(batch <= 3 * sizeof(int16_t)); const v128_t vx = wasm_i32x4_load16x4(input); v128_t vacclo = wasm_i64x2_extmul_low_i32x4(vx, vmultiplier); v128_t vacchi = wasm_i64x2_extmul_high_i32x4(vx, vmultiplier); vacclo = wasm_i64x2_add(vacclo, vbias); vacchi = wasm_i64x2_add(vacchi, vbias); vacclo = wasm_i64x2_shr(vacclo, 16); vacchi = wasm_i64x2_shr(vacchi, 16); v128_t vacc = wasm_v32x4_shuffle(vacclo, vacchi, 0, 2, 4, 6); vacc = wasm_i16x8_narrow_i32x4(vacc, vacc); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); if (batch & (2 * sizeof(int16_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(int16_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qs8_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const 
int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t))); v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = 
wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))); const v128_t vi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); const v128_t vk0x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))); i0 += 16; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))); const v128_t vi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); const v128_t vk1x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))); i1 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))); const v128_t vi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); const v128_t vk2x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))); i2 += 16; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))); const v128_t vi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); const v128_t vk3x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))); i3 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))); const v128_t vi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); const v128_t vk4x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))); i4 += 16; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))); const v128_t vi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); const v128_t vk5x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))); i5 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, 
wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))); const v128_t vi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); const v128_t vk6x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))); i6 += 16; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))); const v128_t vi7x89ABCDEF = wasm_i16x8_load8x8(i7 + 8); const v128_t vk7x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))); i7 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))); const v128_t vi8x89ABCDEF = wasm_i16x8_load8x8(i8 + 8); const v128_t vk8x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))); i8 += 16; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF); const v128_t vi9x01234567 = wasm_i16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))); const v128_t vi9x89ABCDEF = wasm_i16x8_load8x8(i9 + 8); const v128_t vk9x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))); i9 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi9x01234567, vk9x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi9x89ABCDEF, vk9x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi10x01234567 = wasm_i16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))); const v128_t vi10x89ABCDEF = wasm_i16x8_load8x8(i10 + 8); const v128_t vk10x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))); 
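    // All 25 taps in this loop follow the same mul16+add16 pattern: inputs and
    // weights are sign-extended from int8 to int16, an even/odd pair of taps is
    // multiplied and summed in 16-bit lanes, and only the paired sum is widened
    // into the four int32 accumulators, halving the number of widening adds.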
i10 += 16; vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi10x89ABCDEF, vk10x89ABCDEF); const v128_t vi11x01234567 = wasm_i16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))); const v128_t vi11x89ABCDEF = wasm_i16x8_load8x8(i11 + 8); const v128_t vk11x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))); i11 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi11x01234567, vk11x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi11x89ABCDEF, vk11x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi12x01234567 = wasm_i16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))); const v128_t vi12x89ABCDEF = wasm_i16x8_load8x8(i12 + 8); const v128_t vk12x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))); i12 += 16; vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi12x89ABCDEF, vk12x89ABCDEF); const v128_t vi13x01234567 = wasm_i16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))); const v128_t vi13x89ABCDEF = wasm_i16x8_load8x8(i13 + 8); const v128_t vk13x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))); i13 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi13x01234567, vk13x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi13x89ABCDEF, vk13x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi14x01234567 = wasm_i16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))); const v128_t vi14x89ABCDEF = wasm_i16x8_load8x8(i14 + 8); const v128_t vk14x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))); i14 += 16; vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi14x89ABCDEF, vk14x89ABCDEF); const v128_t vi15x01234567 = wasm_i16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))); const v128_t vi15x89ABCDEF = wasm_i16x8_load8x8(i15 + 8); const v128_t vk15x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))); i15 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi15x01234567, vk15x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi15x89ABCDEF, vk15x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); 
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi16x01234567 = wasm_i16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))); const v128_t vi16x89ABCDEF = wasm_i16x8_load8x8(i16 + 8); const v128_t vk16x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))); i16 += 16; vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi16x89ABCDEF, vk16x89ABCDEF); const v128_t vi17x01234567 = wasm_i16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))); const v128_t vi17x89ABCDEF = wasm_i16x8_load8x8(i17 + 8); const v128_t vk17x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))); i17 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi17x01234567, vk17x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi17x89ABCDEF, vk17x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi18x01234567 = wasm_i16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))); const v128_t vi18x89ABCDEF = wasm_i16x8_load8x8(i18 + 8); const v128_t vk18x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))); i18 += 16; vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi18x89ABCDEF, vk18x89ABCDEF); const v128_t vi19x01234567 = wasm_i16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))); const v128_t vi19x89ABCDEF = wasm_i16x8_load8x8(i19 + 8); const v128_t vk19x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))); i19 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi19x01234567, vk19x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi19x89ABCDEF, vk19x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi20x01234567 = wasm_i16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))); const v128_t vi20x89ABCDEF = wasm_i16x8_load8x8(i20 + 8); const v128_t vk20x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))); i20 += 16; vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi20x89ABCDEF, vk20x89ABCDEF); const v128_t vi21x01234567 = 
wasm_i16x8_load8x8(i21); const v128_t vk21x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))); const v128_t vi21x89ABCDEF = wasm_i16x8_load8x8(i21 + 8); const v128_t vk21x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))); i21 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi21x01234567, vk21x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi21x89ABCDEF, vk21x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi22x01234567 = wasm_i16x8_load8x8(i22); const v128_t vk22x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))); const v128_t vi22x89ABCDEF = wasm_i16x8_load8x8(i22 + 8); const v128_t vk22x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))); i22 += 16; vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi22x89ABCDEF, vk22x89ABCDEF); const v128_t vi23x01234567 = wasm_i16x8_load8x8(i23); const v128_t vk23x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))); const v128_t vi23x89ABCDEF = wasm_i16x8_load8x8(i23 + 8); const v128_t vk23x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))); i23 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi23x01234567, vk23x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi23x89ABCDEF, vk23x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi24x01234567 = wasm_i16x8_load8x8(i24); const v128_t vk24x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))); const v128_t vi24x89ABCDEF = wasm_i16x8_load8x8(i24 + 8); const v128_t vk24x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))); i24 += 16; vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); 
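    // fp32 requantization, in progress here: after scaling, adding the "magic"
    // bias places the rounded integer result in the low mantissa bits of each
    // float; wasm_i32x4_max against magic_min then applies the output-min clamp
    // in the integer domain, and subtracting magic_bias_less_output_zero_point
    // yields zero-point-adjusted int32 values ready for narrowing to int8.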
vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((uintptr_t) w + 16 * sizeof(int32_t)); do { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8(k); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) (k + 16)); i1 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) (k + 32)); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) (k + 48)); i3 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) (k + 64)); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) (k + 80)); i5 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) (k + 96)); i6 
+= 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) (k + 112)); i7 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) (k + 128)); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); const v128_t vi9x01234567 = wasm_i16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_i16x8_load8x8((const void*) (k + 144)); i9 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi9x01234567, vk9x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi10x01234567 = wasm_i16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_i16x8_load8x8((const void*) (k + 160)); i10 += 8; vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); const v128_t vi11x01234567 = wasm_i16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_i16x8_load8x8((const void*) (k + 176)); i11 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi11x01234567, vk11x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi12x01234567 = wasm_i16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_i16x8_load8x8((const void*) (k + 192)); i12 += 8; vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); const v128_t vi13x01234567 = wasm_i16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_i16x8_load8x8((const void*) (k + 208)); i13 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi13x01234567, vk13x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi14x01234567 = wasm_i16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_i16x8_load8x8((const void*) (k + 224)); i14 += 8; vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); const v128_t vi15x01234567 = wasm_i16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_i16x8_load8x8((const void*) (k + 240)); i15 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi15x01234567, vk15x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi16x01234567 = wasm_i16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_i16x8_load8x8((const void*) (k + 256)); i16 += 8; vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); const v128_t vi17x01234567 = wasm_i16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_i16x8_load8x8((const void*) (k + 272)); i17 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi17x01234567, vk17x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi18x01234567 = wasm_i16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_i16x8_load8x8((const void*) (k + 288)); i18 += 8; 
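    // Channel remainder (c < 16), continued: the same 25-tap mul16+add16
    // accumulation runs on 8 channels per pass, and the partial stores at the
    // bottom of this block emit the final 4/2/1 channels.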
vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); const v128_t vi19x01234567 = wasm_i16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_i16x8_load8x8((const void*) (k + 304)); i19 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi19x01234567, vk19x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi20x01234567 = wasm_i16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_i16x8_load8x8((const void*) (k + 320)); i20 += 8; vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567); const v128_t vi21x01234567 = wasm_i16x8_load8x8(i21); const v128_t vk21x01234567 = wasm_i16x8_load8x8((const void*) (k + 336)); i21 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi21x01234567, vk21x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi22x01234567 = wasm_i16x8_load8x8(i22); const v128_t vk22x01234567 = wasm_i16x8_load8x8((const void*) (k + 352)); i22 += 8; vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567); const v128_t vi23x01234567 = wasm_i16x8_load8x8(i23); const v128_t vk23x01234567 = wasm_i16x8_load8x8((const void*) (k + 368)); i23 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi23x01234567, vk23x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi24x01234567 = wasm_i16x8_load8x8(i24); const v128_t vk24x01234567 = wasm_i16x8_load8x8((const void*) (k + 384)); i24 += 8; vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); k += 8; vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); if XNN_LIKELY(c >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; c -= 8; } else { if (c & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) 
{ wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t))); v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))); const v128_t vi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); const v128_t vk0x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))); i0 += 16; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))); const v128_t vi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); const v128_t vk1x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))); i1 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, 
wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))); const v128_t vi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); const v128_t vk2x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))); i2 += 16; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))); const v128_t vi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); const v128_t vk3x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))); i3 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))); const v128_t vi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); const v128_t vk4x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))); i4 += 16; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))); const v128_t vi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); const v128_t vk5x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))); i5 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))); const v128_t vi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); const v128_t vk6x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))); i6 += 16; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const 
v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))); const v128_t vi7x89ABCDEF = wasm_i16x8_load8x8(i7 + 8); const v128_t vk7x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))); i7 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))); const v128_t vi8x89ABCDEF = wasm_i16x8_load8x8(i8 + 8); const v128_t vk8x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))); i8 += 16; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, 
vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((uintptr_t) w + 16 * sizeof(int32_t)); do { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8(k); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) (k + 16)); i1 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) (k + 32)); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) (k + 48)); i3 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) (k + 64)); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) (k + 80)); i5 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) (k + 96)); i6 += 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) (k + 112)); i7 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) (k + 128)); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); k += 8; vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, 
vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); if XNN_LIKELY(c >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; c -= 8; } else { if (c & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32( size_t batch, const int8_t* input, float* output, const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vminus_zero_point = wasm_v128_load64_splat(params->wasmsimd.minus_zero_point); const v128_t vscale = wasm_v128_load64_splat(params->wasmsimd.scale); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { v128_t vx01234567 = wasm_i16x8_load8x8(input); v128_t vx89ABCDEF = wasm_i16x8_load8x8(input + 8); v128_t vxGHIJKLMN = wasm_i16x8_load8x8(input + 16); v128_t vxOPQRSTUV = wasm_i16x8_load8x8(input + 24); input += 32; vx01234567 = wasm_i16x8_add(vx01234567, vminus_zero_point); vx89ABCDEF = wasm_i16x8_add(vx89ABCDEF, vminus_zero_point); vxGHIJKLMN = wasm_i16x8_add(vxGHIJKLMN, vminus_zero_point); vxOPQRSTUV = wasm_i16x8_add(vxOPQRSTUV, vminus_zero_point); v128_t vy0123 = wasm_i32x4_extend_low_i16x8(vx01234567); v128_t vy4567 = wasm_i32x4_extend_high_i16x8(vx01234567); v128_t vy89AB = wasm_i32x4_extend_low_i16x8(vx89ABCDEF); v128_t vyCDEF = wasm_i32x4_extend_high_i16x8(vx89ABCDEF); v128_t vyGHIJ = wasm_i32x4_extend_low_i16x8(vxGHIJKLMN); v128_t vyKLMN = wasm_i32x4_extend_high_i16x8(vxGHIJKLMN); v128_t vyOPQR = wasm_i32x4_extend_low_i16x8(vxOPQRSTUV); v128_t vySTUV = wasm_i32x4_extend_high_i16x8(vxOPQRSTUV); vy0123 = wasm_f32x4_convert_i32x4(vy0123); vy4567 = wasm_f32x4_convert_i32x4(vy4567); vy89AB = wasm_f32x4_convert_i32x4(vy89AB); vyCDEF = wasm_f32x4_convert_i32x4(vyCDEF); vyGHIJ = wasm_f32x4_convert_i32x4(vyGHIJ); vyKLMN = wasm_f32x4_convert_i32x4(vyKLMN); vyOPQR = wasm_f32x4_convert_i32x4(vyOPQR); vySTUV = wasm_f32x4_convert_i32x4(vySTUV); vy0123 = wasm_f32x4_mul(vy0123, vscale); vy4567 = wasm_f32x4_mul(vy4567, vscale); vy89AB = wasm_f32x4_mul(vy89AB, vscale); vyCDEF = wasm_f32x4_mul(vyCDEF, vscale); vyGHIJ = wasm_f32x4_mul(vyGHIJ, vscale); vyKLMN = wasm_f32x4_mul(vyKLMN, vscale); vyOPQR = wasm_f32x4_mul(vyOPQR, vscale); vySTUV = wasm_f32x4_mul(vySTUV, vscale); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); wasm_v128_store(output + 16, 
vyGHIJ); wasm_v128_store(output + 20, vyKLMN); wasm_v128_store(output + 24, vyOPQR); wasm_v128_store(output + 28, vySTUV); output += 32; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { v128_t vx = wasm_i16x8_load8x8(input); vx = wasm_i16x8_add(vx, vminus_zero_point); input += 8; v128_t vy_lo = wasm_i32x4_extend_low_i16x8(vx); v128_t vy_hi = wasm_i32x4_extend_high_i16x8(vx); vy_lo = wasm_f32x4_convert_i32x4(vy_lo); vy_hi = wasm_f32x4_convert_i32x4(vy_hi); vy_lo = wasm_f32x4_mul(vy_lo, vscale); vy_hi = wasm_f32x4_mul(vy_hi, vscale); wasm_v128_store(output, vy_lo); wasm_v128_store(output + 4, vy_hi); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); v128_t vx = wasm_i16x8_load8x8(input); vx = wasm_i16x8_add(vx, vminus_zero_point); input += 8; v128_t vy = wasm_i32x4_extend_low_i16x8(vx); vy = wasm_f32x4_convert_i32x4(vy); vy = wasm_f32x4_mul(vy, vscale); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store(output, vy); output += 4; vy = wasm_i32x4_extend_high_i16x8(vx); vy = wasm_f32x4_convert_i32x4(vy); vy = wasm_f32x4_mul(vy, vscale); } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int32_t* buffer, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 16)) { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = 
wasm_i16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); wasm_v128_store(b, vacc0123); wasm_v128_store(b + 4, vacc4567); wasm_v128_store(b + 8, vacc89AB); wasm_v128_store(b + 12, vaccCDEF); b += 16; } for (rows -= 7; rows > 7; rows -= 7) { i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 16)) { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_v128_load(b); v128_t vacc4567 = wasm_v128_load(b + 4); v128_t vacc89AB = wasm_v128_load(b + 8); v128_t vaccCDEF = wasm_v128_load(b + 12); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); wasm_v128_store(b, vacc0123); wasm_v128_store(b + 4, 
vacc4567); wasm_v128_store(b + 8, vacc89AB); wasm_v128_store(b + 12, vaccCDEF); b += 16; } } i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); for (; channels >= 16; channels -= 16) { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_v128_load(buffer); v128_t vacc4567 = wasm_v128_load(buffer + 4); v128_t vacc89AB = wasm_v128_load(buffer + 8); v128_t vaccCDEF = wasm_v128_load(buffer + 12); buffer += 16; vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); 
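/* Final-pass requantization: the accumulated sums, now in fp32, are scaled, rounded by adding the magic bias, clamped as int32 against magic_min, and then magic_bias_less_output_zero_point is subtracted, leaving rounded int32 values with the output zero point already folded in. */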
vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(channels != 0) { do { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); i0 += 8; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); i1 += 8; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); i2 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); i3 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); i4 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); i5 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); i6 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); v128_t vacc0123 = wasm_v128_load(buffer); v128_t vacc4567 = wasm_v128_load(buffer + 4); buffer += 8; vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(channels >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; channels -= 8; } else { if (channels & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (channels & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (channels & 1) { 
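/* Store the last remaining byte of the channel tail. */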
wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } channels = 0; } } while (channels != 0); } } void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16( size_t rows, size_t channels, const int8_t* input, size_t input_stride, const int8_t* zero, int8_t* output, const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const int8_t* i0 = input; const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); for (; channels >= 16; channels -= 16) { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); v128_t vacc89AB = wasm_i32x4_add(vinit_bias, 
wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(channels != 0) { do { const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); i0 += 8; const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); i1 += 8; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); i2 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); i3 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); i4 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); i5 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); i6 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(channels >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; channels -= 8; } else { if (channels & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = 
wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (channels & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (channels & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } channels = 0; } } while (channels != 0); } }
/* Single-row QS8 GEMM microkernel: accumulates a 1x4 output tile with i16x8 dot products, rotating the A vector by one 32-bit lane between the four B sub-vectors (the c2s4 layout), then requantizes via the fp32 magic-bias path. */
void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); w = (const int32_t*) w + 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0); a0 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123); v128_t vacc = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vacc = wasm_i8x16_min(vacc, voutput_max); if XNN_LIKELY(nc >= 4) { wasm_v128_store32_lane(c0, vacc, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c0, vacc, 0); c0 += 2; vacc = wasm_u32x4_shr(vacc, 16); } if (nc & 1) { wasm_v128_store8_lane(c0, vacc, 0); } nc = 0; } } while (nc != 0); }
/* 4-row variant of the microkernel above: up to 4 rows of A are processed per pass with the same rotated-dot-product inner loop and fp32 magic-bias requantization. */
void xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0);
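/* The a1..a3 and c1..c3 row pointers set up below are clamped so that when mr < 4 they alias the previous row; the redundant rows recompute and rewrite the same output. */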
assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const int8_t* a0 = a; int8_t* c0 = c; const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { a1 = a0; c1 = c0; } const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { a2 = a1; c2 = c1; } const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc1x0123 = vacc0x0123; v128_t vacc2x0123 = vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const int32_t*) w + 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0); a0 += 8; v128_t vxa1 = wasm_i16x8_load8x8((const v128_t*) a1); a1 += 8; v128_t vxa2 = wasm_i16x8_load8x8((const v128_t*) a2); a2 += 8; v128_t vxa3 = wasm_i16x8_load8x8((const v128_t*) a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const 
v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123); v128_t vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123); v128_t vacc = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vacc = wasm_i8x16_min(vacc, voutput_max); if XNN_LIKELY(nc >= 4) { wasm_v128_store32_lane(c0, vacc, 0); wasm_v128_store32_lane(c1, vacc, 1); wasm_v128_store32_lane(c2, vacc, 2); wasm_v128_store32_lane(c3, vacc, 3); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); a0 = (const int8_t*) ((uintptr_t) a0 - kc); a1 = (const int8_t*) ((uintptr_t) a1 - kc); a2 = (const int8_t*) ((uintptr_t) a2 - kc); a3 = (const int8_t*) ((uintptr_t) a3 - kc); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c0, vacc, 0); c0 += 2; wasm_v128_store16_lane(c1, vacc, 2); c1 += 2; wasm_v128_store16_lane(c2, vacc, 4); c2 += 2; wasm_v128_store16_lane(c3, vacc, 6); c3 += 2; vacc = wasm_u32x4_shr(vacc, 16); } if (nc & 1) { wasm_v128_store8_lane(c0, vacc, 0); wasm_v128_store8_lane(c1, vacc, 4); wasm_v128_store8_lane(c2, vacc, 8); wasm_v128_store8_lane(c3, vacc, 12); } nc = 0; } } while (nc != 0); }
/* Indirect-GEMM counterpart of the 1x4 microkernel above: A rows come from an indirection buffer of ks pointers (entries equal to `zero` are left unadjusted by a_offset), while the inner c2s4 dot-product loop and fp32 requantization are the same. */
void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const int8_t** restrict a, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (1 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } a += 1; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8(a0); a0 += 8; const v128_t
vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); p -= 1 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123); v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_i8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c0, vout, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const int8_t** restrict a, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc1x0123 = vacc0x0123; v128_t vacc2x0123 = vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } const int8_t* restrict a2 = a[2]; if 
XNN_UNPREDICTABLE(a2 != zero) { a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); } const int8_t* restrict a3 = a[3]; if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8(a0); a0 += 8; v128_t vxa1 = wasm_i16x8_load8x8(a1); a1 += 8; v128_t vxa2 = wasm_i16x8_load8x8(a2); a2 += 8; v128_t vxa3 = wasm_i16x8_load8x8(a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); p -= 4 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = 
wasm_i32x4_max(vacc0x0123, vmagic_min); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123); v128_t vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123); v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_i8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c3, vout, 3); wasm_v128_store32_lane(c2, vout, 2); wasm_v128_store32_lane(c1, vout, 1); wasm_v128_store32_lane(c0, vout, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c3, vout, 6); c3 += 2; wasm_v128_store16_lane(c2, vout, 4); c2 += 2; wasm_v128_store16_lane(c1, vout, 2); c1 += 2; wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c3, vout, 12); wasm_v128_store8_lane(c2, vout, 8); wasm_v128_store8_lane(c1, vout, 4); wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p16c__wasmsimd_mul16_add16( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } const int8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 
!= zero) { i9 = (const int8_t*) ((uintptr_t) i9 + input_offset); } const int8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const int8_t*) ((uintptr_t) i10 + input_offset); } const int8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const int8_t*) ((uintptr_t) i11 + input_offset); } const int8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const int8_t*) ((uintptr_t) i12 + input_offset); } const int8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const int8_t*) ((uintptr_t) i13 + input_offset); } const int8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const int8_t*) ((uintptr_t) i14 + input_offset); } const int8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const int8_t*) ((uintptr_t) i15 + input_offset); } const int8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const int8_t*) ((uintptr_t) i16 + input_offset); } const int8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const int8_t*) ((uintptr_t) i17 + input_offset); } const int8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const int8_t*) ((uintptr_t) i18 + input_offset); } const int8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const int8_t*) ((uintptr_t) i19 + input_offset); } const int8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const int8_t*) ((uintptr_t) i20 + input_offset); } const int8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const int8_t*) ((uintptr_t) i21 + input_offset); } const int8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const int8_t*) ((uintptr_t) i22 + input_offset); } const int8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const int8_t*) ((uintptr_t) i23 + input_offset); } const int8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const int8_t*) ((uintptr_t) i24 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 16; c -= 16) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t))); v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))); const v128_t vi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); const v128_t vk0x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))); i0 += 16; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))); const v128_t vi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); const v128_t vk1x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))); i1 += 16; 
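/* mul16_add16 scheme: the products of two consecutive taps are summed at 16-bit width before being sign-extended and added into the 32-bit accumulators, which is the pairing the kernel's name refers to. */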
vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))); const v128_t vi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); const v128_t vk2x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))); i2 += 16; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))); const v128_t vi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); const v128_t vk3x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))); i3 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))); const v128_t vi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); const v128_t vk4x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))); i4 += 16; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))); const v128_t vi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); const v128_t vk5x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))); i5 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))); const v128_t vi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); const v128_t vk6x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * 
sizeof(int32_t) + 104 * sizeof(int8_t))); i6 += 16; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))); const v128_t vi7x89ABCDEF = wasm_i16x8_load8x8(i7 + 8); const v128_t vk7x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))); i7 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))); const v128_t vi8x89ABCDEF = wasm_i16x8_load8x8(i8 + 8); const v128_t vk8x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))); i8 += 16; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF); const v128_t vi9x01234567 = wasm_i16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))); const v128_t vi9x89ABCDEF = wasm_i16x8_load8x8(i9 + 8); const v128_t vk9x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t))); i9 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi9x01234567, vk9x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi9x89ABCDEF, vk9x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi10x01234567 = wasm_i16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t))); const v128_t vi10x89ABCDEF = wasm_i16x8_load8x8(i10 + 8); const v128_t vk10x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t))); i10 += 16; vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi10x89ABCDEF, vk10x89ABCDEF); const v128_t vi11x01234567 = wasm_i16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t))); const v128_t vi11x89ABCDEF = wasm_i16x8_load8x8(i11 + 8); const v128_t vk11x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t))); i11 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi11x01234567, vk11x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi11x89ABCDEF, vk11x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); 
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi12x01234567 = wasm_i16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t))); const v128_t vi12x89ABCDEF = wasm_i16x8_load8x8(i12 + 8); const v128_t vk12x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t))); i12 += 16; vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi12x89ABCDEF, vk12x89ABCDEF); const v128_t vi13x01234567 = wasm_i16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t))); const v128_t vi13x89ABCDEF = wasm_i16x8_load8x8(i13 + 8); const v128_t vk13x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t))); i13 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi13x01234567, vk13x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi13x89ABCDEF, vk13x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi14x01234567 = wasm_i16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t))); const v128_t vi14x89ABCDEF = wasm_i16x8_load8x8(i14 + 8); const v128_t vk14x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t))); i14 += 16; vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi14x89ABCDEF, vk14x89ABCDEF); const v128_t vi15x01234567 = wasm_i16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t))); const v128_t vi15x89ABCDEF = wasm_i16x8_load8x8(i15 + 8); const v128_t vk15x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t))); i15 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi15x01234567, vk15x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi15x89ABCDEF, vk15x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi16x01234567 = wasm_i16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t))); const v128_t vi16x89ABCDEF = wasm_i16x8_load8x8(i16 + 8); const v128_t vk16x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t))); i16 += 16; vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi16x89ABCDEF, vk16x89ABCDEF); const v128_t vi17x01234567 = 
wasm_i16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t))); const v128_t vi17x89ABCDEF = wasm_i16x8_load8x8(i17 + 8); const v128_t vk17x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t))); i17 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi17x01234567, vk17x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi17x89ABCDEF, vk17x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi18x01234567 = wasm_i16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t))); const v128_t vi18x89ABCDEF = wasm_i16x8_load8x8(i18 + 8); const v128_t vk18x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t))); i18 += 16; vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi18x89ABCDEF, vk18x89ABCDEF); const v128_t vi19x01234567 = wasm_i16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t))); const v128_t vi19x89ABCDEF = wasm_i16x8_load8x8(i19 + 8); const v128_t vk19x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t))); i19 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi19x01234567, vk19x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi19x89ABCDEF, vk19x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi20x01234567 = wasm_i16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t))); const v128_t vi20x89ABCDEF = wasm_i16x8_load8x8(i20 + 8); const v128_t vk20x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t))); i20 += 16; vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi20x89ABCDEF, vk20x89ABCDEF); const v128_t vi21x01234567 = wasm_i16x8_load8x8(i21); const v128_t vk21x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t))); const v128_t vi21x89ABCDEF = wasm_i16x8_load8x8(i21 + 8); const v128_t vk21x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t))); i21 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi21x01234567, vk21x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi21x89ABCDEF, vk21x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); 
vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi22x01234567 = wasm_i16x8_load8x8(i22); const v128_t vk22x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t))); const v128_t vi22x89ABCDEF = wasm_i16x8_load8x8(i22 + 8); const v128_t vk22x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t))); i22 += 16; vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi22x89ABCDEF, vk22x89ABCDEF); const v128_t vi23x01234567 = wasm_i16x8_load8x8(i23); const v128_t vk23x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t))); const v128_t vi23x89ABCDEF = wasm_i16x8_load8x8(i23 + 8); const v128_t vk23x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t))); i23 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi23x01234567, vk23x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi23x89ABCDEF, vk23x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi24x01234567 = wasm_i16x8_load8x8(i24); const v128_t vk24x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t))); const v128_t vi24x89ABCDEF = wasm_i16x8_load8x8(i24 + 8); const v128_t vk24x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t))); i24 += 16; vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi24x89ABCDEF, vk24x89ABCDEF); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); const v128_t vscale0123 = wasm_v128_load(w); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); const v128_t vscale89AB = wasm_v128_load((const float*) w + 8); const v128_t vscaleCDEF = wasm_v128_load((const float*) w + 12); w = (const void*) ((const float*) w + 16); vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123); vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale89AB); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscaleCDEF); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); 
vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((uintptr_t) w + 16 * sizeof(int32_t)); do { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8(k); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) (k + 16)); i1 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) (k + 32)); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) (k + 48)); i3 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) (k + 64)); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) (k + 80)); i5 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) (k + 96)); i6 += 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) (k + 112)); i7 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const 
void*) (k + 128)); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); const v128_t vi9x01234567 = wasm_i16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_i16x8_load8x8((const void*) (k + 144)); i9 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi9x01234567, vk9x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi10x01234567 = wasm_i16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_i16x8_load8x8((const void*) (k + 160)); i10 += 8; vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); const v128_t vi11x01234567 = wasm_i16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_i16x8_load8x8((const void*) (k + 176)); i11 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi11x01234567, vk11x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi12x01234567 = wasm_i16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_i16x8_load8x8((const void*) (k + 192)); i12 += 8; vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); const v128_t vi13x01234567 = wasm_i16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_i16x8_load8x8((const void*) (k + 208)); i13 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi13x01234567, vk13x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi14x01234567 = wasm_i16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_i16x8_load8x8((const void*) (k + 224)); i14 += 8; vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); const v128_t vi15x01234567 = wasm_i16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_i16x8_load8x8((const void*) (k + 240)); i15 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi15x01234567, vk15x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi16x01234567 = wasm_i16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_i16x8_load8x8((const void*) (k + 256)); i16 += 8; vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); const v128_t vi17x01234567 = wasm_i16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_i16x8_load8x8((const void*) (k + 272)); i17 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi17x01234567, vk17x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi18x01234567 = wasm_i16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_i16x8_load8x8((const void*) (k + 288)); i18 += 8; vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); const v128_t vi19x01234567 = wasm_i16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_i16x8_load8x8((const void*) (k + 304)); i19 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi19x01234567, vk19x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi20x01234567 = wasm_i16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_i16x8_load8x8((const 
        void*) (k + 320));
        i20 += 8;

        vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567);

        const v128_t vi21x01234567 = wasm_i16x8_load8x8(i21);
        const v128_t vk21x01234567 = wasm_i16x8_load8x8((const void*) (k + 336));
        i21 += 8;

        vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi21x01234567, vk21x01234567));

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567));

        const v128_t vi22x01234567 = wasm_i16x8_load8x8(i22);
        const v128_t vk22x01234567 = wasm_i16x8_load8x8((const void*) (k + 352));
        i22 += 8;

        vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567);

        const v128_t vi23x01234567 = wasm_i16x8_load8x8(i23);
        const v128_t vk23x01234567 = wasm_i16x8_load8x8((const void*) (k + 368));
        i23 += 8;

        vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi23x01234567, vk23x01234567));

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567));

        const v128_t vi24x01234567 = wasm_i16x8_load8x8(i24);
        const v128_t vk24x01234567 = wasm_i16x8_load8x8((const void*) (k + 384));
        i24 += 8;

        vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567));

        k += 8;

        vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
        vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);

        const v128_t vscale0123 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
        const v128_t vscale4567 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t) + 4 * sizeof(float)));

        vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123);
        vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567);

        const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
        vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
        vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);

        const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
        vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
        vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);

        const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
        vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
        vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);

        v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
        v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567);

        const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
        vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max);

        w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));

        if XNN_LIKELY(c >= 8) {
          wasm_v128_store64_lane(output, vout0123456701234567, 0);
          output += 8;
          c -= 8;
        } else {
          if (c & 4) {
            wasm_v128_store32_lane(output, vout0123456701234567, 0);
            vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
            output += 4;
          }
          if (c & 2) {
            wasm_v128_store16_lane(output, vout0123456701234567, 0);
            vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
            output += 2;
          }
          if (c & 1) {
            wasm_v128_store8_lane(output, vout0123456701234567, 0);
            output += 1;
          }
          c = 0;
        }
      } while (c != 0);
    }

    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
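// 3-tap, 16-channel depthwise-convolution microkernel, following the same
// mul16+add16 scheme as the kernel above: int8 inputs and weights are widened
// to 16 bits, each tap is multiplied in i16x8 lanes, adjacent tap products are
// pre-added in 16 bits where possible, and the sums are widened into four
// i32x4 accumulators before the fp32 magic-bias requantization.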
void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__wasmsimd_mul16_add16(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    intptr_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  do {
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    for (; c >= 16; c -= 16) {
      v128_t vacc0123 = wasm_v128_load(w);
      v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
      v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t)));
      v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t)));

      const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0);
      const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
      const v128_t vi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8);
      const v128_t vk0x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
      i0 += 16;

      v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
      v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF);

      const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1);
      const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
      const v128_t vi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8);
      const v128_t vk1x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
      i1 += 16;

      vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567));
      vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF));

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF));

      const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2);
      const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
      const v128_t vi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8);
      const v128_t vk2x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
      i2 += 16;

      vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF));

      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t));

      vacc0123 =
wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); const v128_t vscale0123 = wasm_v128_load(w); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); const v128_t vscale89AB = wasm_v128_load((const float*) w + 8); const v128_t vscaleCDEF = wasm_v128_load((const float*) w + 12); w = (const void*) ((const float*) w + 16); vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123); vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale89AB); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscaleCDEF); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((uintptr_t) w + 16 * sizeof(int32_t)); do { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8(k); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) (k + 16)); i1 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) (k + 32)); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); k += 8; vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale0123 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))); const v128_t vscale4567 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * 
sizeof(int8_t) + 4 * sizeof(float))); vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123); vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); if XNN_LIKELY(c >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; c -= 8; } else { if (c & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p16c__wasmsimd_mul16_add16( size_t channels, size_t output_width, const int8_t** input, const void* weights, int8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); do { const int8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); } const int8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); } const int8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); } const int8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); } const int8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); } const int8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); } const int8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); } const int8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); } const int8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); } input = (const int8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 
16; c -= 16) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t))); v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t))); const v128_t vi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); const v128_t vk0x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t))); i0 += 16; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t))); const v128_t vi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); const v128_t vk1x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t))); i1 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t))); const v128_t vi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); const v128_t vk2x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t))); i2 += 16; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t))); const v128_t vi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); const v128_t vk3x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t))); i3 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t))); const v128_t vi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); const v128_t vk4x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t))); i4 += 16; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF); const v128_t 
vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t))); const v128_t vi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); const v128_t vk5x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t))); i5 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t))); const v128_t vi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); const v128_t vk6x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t))); i6 += 16; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t))); const v128_t vi7x89ABCDEF = wasm_i16x8_load8x8(i7 + 8); const v128_t vk7x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t))); i7 += 16; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vprod89ABCDEF = wasm_i16x8_add(vprod89ABCDEF, wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t))); const v128_t vi8x89ABCDEF = wasm_i16x8_load8x8(i8 + 8); const v128_t vk8x89ABCDEF = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t))); i8 += 16; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vprod89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vprod89ABCDEF)); w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); const v128_t vscale0123 = wasm_v128_load(w); const v128_t vscale4567 = wasm_v128_load((const float*) w + 4); const v128_t vscale89AB = wasm_v128_load((const float*) w + 8); const v128_t vscaleCDEF = wasm_v128_load((const float*) w + 12); w = 
(const void*) ((const float*) w + 16); vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123); vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale89AB); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscaleCDEF); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(c != 0) { const int8_t* k = (const int8_t*) ((uintptr_t) w + 16 * sizeof(int32_t)); do { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_i16x8_load8x8(k); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) (k + 16)); i1 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi1x01234567, vk1x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) (k + 32)); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) (k + 48)); i3 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi3x01234567, vk3x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) (k + 64)); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) (k + 80)); i5 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi5x01234567, vk5x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = 
wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) (k + 96)); i6 += 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) (k + 112)); i7 += 8; vprod01234567 = wasm_i16x8_add(vprod01234567, wasm_i16x8_mul(vi7x01234567, vk7x01234567)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) (k + 128)); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod01234567)); k += 8; vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale0123 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t))); const v128_t vscale4567 = wasm_v128_load((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t) + 4 * sizeof(float))); vacc0123 = wasm_f32x4_mul(vacc0123, vscale0123); vacc4567 = wasm_f32x4_mul(vacc4567, vscale4567); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t)); if XNN_LIKELY(c >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; c -= 8; } else { if (c & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } c = 0; } } while (c != 0); } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, const int8_t* restrict a, size_t a_stride, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(int8_t) == 0); assert(a != NULL); assert(w 
!= NULL);
  assert(c != NULL);

  const int8_t* a0 = a;
  int8_t* c0 = c;

  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    v128_t vacc0x0123 = wasm_v128_load(w);
    w = (const int32_t*) w + 4;

    size_t k = kc;
    do {
      v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0);
      a0 += 8;

      const v128_t vb01 = wasm_v128_load(w);
      const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01);
      const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01);

      // Each wasm_v32x4_shuffle rotates the eight 16-bit input values by one
      // 32-bit lane (one pair of k values) so the next dot product lines up
      // with the next packed block of weights.
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);

      const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16);
      const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23);
      const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23);

      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3));

      w = (const int8_t*) w + 32;
      k -= 8 * sizeof(int8_t);
    } while (k != 0);

    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);

    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);

    const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);

    const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);

    const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);

    v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123);
    v128_t vacc = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);

    const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
    vacc = wasm_i8x16_min(vacc, voutput_max);

    if XNN_LIKELY(nc >= 4) {
      wasm_v128_store32_lane(c0, vacc, 0);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

      a0 = (const int8_t*) ((uintptr_t) a0 - kc);

      nc -= 4;
    } else {
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vacc, 0);
        c0 += 2;
        vacc = wasm_u32x4_shr(vacc, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vacc, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}

// 4-row variant of the GEMM above; rows beyond mr are aliased to the previous row.
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const int8_t* a0 = a;
  int8_t* c0 = c;
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    v128_t vacc0x0123 = wasm_v128_load(w);
    v128_t vacc1x0123 = vacc0x0123;
    v128_t vacc2x0123 =
vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const int32_t*) w + 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8((const v128_t*) a0); a0 += 8; v128_t vxa1 = wasm_i16x8_load8x8((const v128_t*) a1); a1 += 8; v128_t vxa2 = wasm_i16x8_load8x8((const v128_t*) a2); a2 += 8; v128_t vxa3 = wasm_i16x8_load8x8((const v128_t*) a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vscale0123 = wasm_v128_load(w); w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); vacc2x0123 = 
wasm_i32x4_max(vacc2x0123, vmagic_min);
    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);

    const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
    vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point);
    vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point);

    v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123);
    v128_t vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123);

    v128_t vacc = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123);

    const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
    vacc = wasm_i8x16_min(vacc, voutput_max);

    if XNN_LIKELY(nc >= 4) {
      wasm_v128_store32_lane(c0, vacc, 0);
      wasm_v128_store32_lane(c1, vacc, 1);
      wasm_v128_store32_lane(c2, vacc, 2);
      wasm_v128_store32_lane(c3, vacc, 3);

      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);

      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);

      nc -= 4;
    } else {
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vacc, 0);
        c0 += 2;
        wasm_v128_store16_lane(c1, vacc, 2);
        c1 += 2;
        wasm_v128_store16_lane(c2, vacc, 4);
        c2 += 2;
        wasm_v128_store16_lane(c3, vacc, 6);
        c3 += 2;
        vacc = wasm_u32x4_shr(vacc, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vacc, 0);
        wasm_v128_store8_lane(c1, vacc, 4);
        wasm_v128_store8_lane(c2, vacc, 8);
        wasm_v128_store8_lane(c3, vacc, 12);
      }

      nc = 0;
    }
  } while (nc != 0);
}

// Indirect-GEMM (igemm) variant of the kernel above: instead of a dense A
// matrix, `a` holds ks row pointers per output pixel, with `zero` marking
// padding rows that must not be offset by a_offset.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  int8_t* c0 = c;

  kc = round_up_po2(kc, 8 * sizeof(int8_t));
  do {
    v128_t vacc0x0123 = wasm_v128_load(w);
    w = (const void*) ((const int32_t*) w + 4);

    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      do {
        v128_t vxa0 = wasm_i16x8_load8x8(a0);
        a0 += 8;

        const v128_t vb01 = wasm_v128_load(w);
        const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01);
        const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01);

        vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0));
        vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
        vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1));
        vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);

        const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23);
        const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23);

        vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2));
        vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
        vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0,
vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); p -= 1 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); const v128_t vscale0123 = wasm_v128_load(w); w = (const void*) ((const float*) w + 4); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123); v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_i8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c0, vout, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const int8_t** restrict a, const void* restrict w, int8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const int8_t* zero, const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(int8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); int8_t* c0 = c; int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } kc = round_up_po2(kc, 8 * sizeof(int8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc1x0123 = vacc0x0123; v128_t vacc2x0123 = vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const int8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); } const int8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); } const int8_t* restrict a2 = a[2]; if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); } const int8_t* restrict a3 = a[3]; if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); } a += 4; size_t k = kc; do { v128_t vxa0 = wasm_i16x8_load8x8(a0); a0 += 8; v128_t vxa1 = wasm_i16x8_load8x8(a1); a1 += 8; v128_t vxa2 = wasm_i16x8_load8x8(a2); a2 += 8; v128_t vxa3 = wasm_i16x8_load8x8(a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01); const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, 
wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const int8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_extend_low_i8x16(vb23); const v128_t vxb3 = wasm_i16x8_extend_high_i8x16(vb23); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const int8_t*) w + 32; k -= 8 * sizeof(int8_t); } while (k != 0); p -= 4 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vscale0123 = wasm_v128_load(w); w = (const void*) ((const float*) w + 4); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123); v128_t 
vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123); v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_i8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c3, vout, 3); wasm_v128_store32_lane(c2, vout, 2); wasm_v128_store32_lane(c1, vout, 1); wasm_v128_store32_lane(c0, vout, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); a = (const int8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c3, vout, 6); c3 += 2; wasm_v128_store16_lane(c2, vout, 4); c2 += 2; wasm_v128_store16_lane(c1, vout, 2); c1 += 2; wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c3, vout, 12); wasm_v128_store8_lane(c2, vout, 8); wasm_v128_store8_lane(c1, vout, 4); wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vbias = wasm_v128_load64_splat(params->wasmsimd.bias); const v128_t va_multiplier = wasm_v128_load64_splat(params->wasmsimd.a_multiplier); const v128_t vb_multiplier = wasm_v128_load64_splat(params->wasmsimd.b_multiplier); const uint32_t vshift = params->wasmsimd.shift; const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.output_min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.output_max); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t vb01234567 = wasm_i16x8_load8x8(input_b); const v128_t va89ABCDEF = wasm_i16x8_load8x8(input_a + 8); const v128_t vb89ABCDEF = wasm_i16x8_load8x8(input_b + 8); const v128_t vaGHIJKLMN = wasm_i16x8_load8x8(input_a + 16); const v128_t vbGHIJKLMN = wasm_i16x8_load8x8(input_b + 16); const v128_t vaOPQRSTUV = wasm_i16x8_load8x8(input_a + 24); const v128_t vbOPQRSTUV = wasm_i16x8_load8x8(input_b + 24); input_a += 32; input_b += 32; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier)); v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va89ABCDEF), va_multiplier)); v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va89ABCDEF), va_multiplier)); v128_t vaccGHIJ = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccKLMN = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccOPQR = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaOPQRSTUV), va_multiplier)); v128_t vaccSTUV = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaOPQRSTUV), va_multiplier)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), 
vb_multiplier)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb89ABCDEF), vb_multiplier)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb89ABCDEF), vb_multiplier)); vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vbGHIJKLMN), vb_multiplier)); vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vbGHIJKLMN), vb_multiplier)); vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vbOPQRSTUV), vb_multiplier)); vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vbOPQRSTUV), vb_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); vacc89AB = wasm_i32x4_shr(vacc89AB, vshift); vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift); vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift); vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift); vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift); vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point); v128_t voutGHIJKLMN = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN), voutput_zero_point); v128_t voutOPQRSTUV = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV), voutput_zero_point); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); v128_t voutGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); vout0123456789ABCDEF = wasm_i8x16_max(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = wasm_i8x16_max(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = wasm_i8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); output += 32; } if XNN_UNLIKELY(batch != 0) { do { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t vb01234567 = wasm_i16x8_load8x8(input_b); input_a += 8; input_b += 8; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_max(vout0123456701234567, voutput_min); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * 
sizeof(int8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t va_multiplier = wasm_v128_load64_splat(params->wasmsimd.a_multiplier); const uint32_t vshift = params->wasmsimd.shift; const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.output_min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.output_max); v128_t vbias = wasm_i32x4_splat((int32_t) *input_b * params->wasmsimd.b_multiplier[0]); vbias = wasm_i32x4_add(vbias, wasm_v128_load64_splat(params->wasmsimd.bias)); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t va89ABCDEF = wasm_i16x8_load8x8(input_a + 8); const v128_t vaGHIJKLMN = wasm_i16x8_load8x8(input_a + 16); const v128_t vaOPQRSTUV = wasm_i16x8_load8x8(input_a + 24); input_a += 32; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier)); v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va89ABCDEF), va_multiplier)); v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va89ABCDEF), va_multiplier)); v128_t vaccGHIJ = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccKLMN = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccOPQR = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaOPQRSTUV), va_multiplier)); v128_t vaccSTUV = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaOPQRSTUV), va_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); vacc89AB = wasm_i32x4_shr(vacc89AB, vshift); vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift); vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift); vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift); vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift); vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point); v128_t voutGHIJKLMN = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN), voutput_zero_point); v128_t voutOPQRSTUV = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV), voutput_zero_point); v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); v128_t voutGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); vout0123456789ABCDEF = wasm_i8x16_max(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = wasm_i8x16_max(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = 
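/* Upper clamp of the narrowed outputs; the wasm_i8x16_max calls above applied the lower bound. */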
wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = wasm_i8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); output += 32; } if XNN_UNLIKELY(batch != 0) { do { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); input_a += 8; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_max(vout0123456701234567, voutput_min); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(batch >= (8 * sizeof(int8_t))) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; batch -= 8 * sizeof(int8_t); } else { if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qs8_vcvt_ukernel__wasmsimd_x16( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point); const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier); const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { v128_t vacc0 = wasm_i16x8_load8x8(input); v128_t vacc1 = wasm_i16x8_load8x8(input + 8); input += 16; vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0); vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1); vacc0 = wasm_i16x8_shl(vacc0, 7); vacc1 = wasm_i16x8_shl(vacc1, 7); vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier); vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier); vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point); vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point); const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1); wasm_v128_store(output, vy0); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { v128_t vacc = wasm_i16x8_load8x8(input); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vacc = wasm_i16x8_shl(vacc, 7); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); input += 8; const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); v128_t vacc = wasm_i16x8_load8x8(input); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vacc = wasm_i16x8_shl(vacc, 7); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, 
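/* Saturating add of the output zero point completes the Q15 requantization. */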
voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qs8_vlrelu_ukernel__wasmsimd_arm_x32( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point); const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier); const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier); const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point); for (; batch >= 32 * sizeof(int8_t); batch -= 32 * sizeof(int8_t)) { v128_t vx0 = wasm_v128_load(input); v128_t vx1 = wasm_v128_load(input + 16); input += 32; v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_low_i8x16(vx0)); v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_high_i8x16(vx0)); v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15); v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15); v128_t vacc2 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_low_i8x16(vx1)); v128_t vacc3 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_high_i8x16(vx1)); v128_t vmultiplier2 = wasm_i16x8_shr(vacc2, 15); v128_t vmultiplier3 = wasm_i16x8_shr(vacc3, 15); vacc0 = wasm_i16x8_shl(vacc0, 7); vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0); vacc1 = wasm_i16x8_shl(vacc1, 7); vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1); vacc2 = wasm_i16x8_shl(vacc2, 7); vmultiplier2 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier2); vacc3 = wasm_i16x8_shl(vacc3, 7); vmultiplier3 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier3); vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier0); vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier1); vacc2 = wasm_i16x8_q15mulr_sat(vacc2, vmultiplier2); vacc3 = wasm_i16x8_q15mulr_sat(vacc3, vmultiplier3); vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point); vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point); vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point); vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point); const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1); const v128_t vy1 = wasm_i8x16_narrow_i16x8(vacc2, vacc3); wasm_v128_store(output, vy0); wasm_v128_store((output + 16), vy1); output += 32; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const v128_t vx = wasm_i16x8_load8x8(input); v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx); v128_t vmultiplier = wasm_i16x8_shr(vacc, 15); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); input += 8; const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 
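/* 1..7 leftover bytes: compute a full 8-lane result and store it piecewise below. */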
7 * sizeof(int8_t)); const v128_t vx = wasm_i16x8_load8x8(input); v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx); v128_t vmultiplier = wasm_i16x8_shr(vacc, 15); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x16( size_t batch, const int8_t* input, int8_t* output, const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point); const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff); const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base); const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point); for (; batch >= 16 * sizeof(int8_t); batch -= 16 * sizeof(int8_t)) { v128_t vacc0 = wasm_i16x8_load8x8(input); v128_t vacc1 = wasm_i16x8_load8x8(input + 8); input += 16; v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point); vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0); v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point); vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1); vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff); vacc0 = wasm_i16x8_shl(vacc0, 7); vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base); vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff); vacc1 = wasm_i16x8_shl(vacc1, 7); vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base); vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier0); vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier1); vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point); vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point); const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1); wasm_v128_store(output, vy0); output += 16; } for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { v128_t vacc = wasm_i16x8_load8x8(input); v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); input += 8; const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(int8_t)); assert(batch <= 7 * sizeof(int8_t)); v128_t vacc = wasm_i16x8_load8x8(input); v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, 
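/* Same Q15 epilogue as the ARM variant: fixed-point multiply, then saturating zero-point add. */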
voutput_zero_point); v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t va_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.a_zero_point); const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.b_zero_point); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t vb01234567 = wasm_i16x8_load8x8(input_b); input_a += 8; input_b += 8; const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); const v128_t vxb01234567 = wasm_i16x8_sub(vb01234567, vb_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb01234567); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb01234567); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t vb01234567 = wasm_i16x8_load8x8(input_b); const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); const v128_t vxb01234567 = wasm_i16x8_sub(vb01234567, vb_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb01234567); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb01234567); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, 
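/* Subtracting (magic_bias - output_zero_point) as integers yields the rounded, zero-point-offset result. */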
vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } } } } void xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8( size_t batch, const int8_t* input_a, const int8_t* input_b, int8_t* output, const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(int8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t va_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.a_zero_point); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); const v128_t vxb = wasm_i16x8_sub( wasm_i16x8_splat((int16_t) *input_b), wasm_v128_load64_splat(params->fp32_wasmsimd.b_zero_point)); for (; batch >= 8 * sizeof(int8_t); batch -= 8 * sizeof(int8_t)) { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); input_a += 8; const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { { const v128_t va01234567 = wasm_i16x8_load8x8(input_a); const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, 
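/* Magic-bias trick: the float add rounds to nearest-even and parks the integer result in the low mantissa bits. */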
vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); if (batch & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } } } } void xnn_qu8_dwconv_minmax_fp32_ukernel_25p8c__wasmsimd_mul16( size_t channels, size_t output_width, const uint8_t** input, const void* weights, uint8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const uint8_t* zero, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vkernel_zero_point = wasm_u32x4_load16x4(params->fp32_wasmsimd.kernel_zero_point); do { const uint8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } const uint8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } const uint8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } const uint8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); } const uint8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } const uint8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } const uint8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } const uint8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } const uint8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); } const uint8_t* i9 = input[9]; assert(i9 != NULL); if XNN_UNPREDICTABLE(i9 != zero) { i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset); } const uint8_t* i10 = input[10]; assert(i10 != NULL); if XNN_UNPREDICTABLE(i10 != zero) { i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset); } const uint8_t* i11 = input[11]; assert(i11 != NULL); if XNN_UNPREDICTABLE(i11 != zero) { i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset); } const uint8_t* i12 = input[12]; assert(i12 != NULL); if XNN_UNPREDICTABLE(i12 != zero) { i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset); } const uint8_t* i13 = input[13]; assert(i13 != NULL); if XNN_UNPREDICTABLE(i13 != zero) { i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset); } const uint8_t* i14 = input[14]; assert(i14 != NULL); if XNN_UNPREDICTABLE(i14 != zero) { i14 = (const uint8_t*) 
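/* One row pointer per tap; rows that equal the 'zero' buffer skip input_offset, so padding taps read zeros. */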
((uintptr_t) i14 + input_offset); } const uint8_t* i15 = input[15]; assert(i15 != NULL); if XNN_UNPREDICTABLE(i15 != zero) { i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset); } const uint8_t* i16 = input[16]; assert(i16 != NULL); if XNN_UNPREDICTABLE(i16 != zero) { i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset); } const uint8_t* i17 = input[17]; assert(i17 != NULL); if XNN_UNPREDICTABLE(i17 != zero) { i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset); } const uint8_t* i18 = input[18]; assert(i18 != NULL); if XNN_UNPREDICTABLE(i18 != zero) { i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset); } const uint8_t* i19 = input[19]; assert(i19 != NULL); if XNN_UNPREDICTABLE(i19 != zero) { i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset); } const uint8_t* i20 = input[20]; assert(i20 != NULL); if XNN_UNPREDICTABLE(i20 != zero) { i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset); } const uint8_t* i21 = input[21]; assert(i21 != NULL); if XNN_UNPREDICTABLE(i21 != zero) { i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset); } const uint8_t* i22 = input[22]; assert(i22 != NULL); if XNN_UNPREDICTABLE(i22 != zero) { i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset); } const uint8_t* i23 = input[23]; assert(i23 != NULL); if XNN_UNPREDICTABLE(i23 != zero) { i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset); } const uint8_t* i24 = input[24]; assert(i24 != NULL); if XNN_UNPREDICTABLE(i24 != zero) { i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset); } input = (const uint8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t))); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t))); v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567); i1 += 8; vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567); i3 += 8; vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, 
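/* High four 16-bit products, widened to 32 bits and accumulated. */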
wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567); i5 += 8; vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567); i6 += 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567); i7 += 8; vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567); i9 += 8; vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567); i10 += 8; vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t))); vsumx01234567 = 
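/* vsumx01234567 keeps a running sum of the inputs for the kernel zero-point correction applied after tap 24. */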
wasm_i16x8_add(vsumx01234567, vi11x01234567); i11 += 8; vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567); i12 += 8; vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi13x01234567 = wasm_u16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567); i13 += 8; vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567); i14 += 8; vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567); i15 += 8; vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567); i16 += 8; vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567); i17 += 8; vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567); i18 += 8; vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = 
wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567); i19 += 8; vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567); i20 += 8; vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21); const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567); i21 += 8; vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22); const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567); i22 += 8; vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23); const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567); i23 += 8; vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24); const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567); i24 += 8; vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point)); vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = 
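/* Round via the magic bias, clamp below, then subtract (magic_bias - output_zero_point) to finish the fp32 requantization. */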
wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; } if XNN_UNLIKELY(c != 0) { { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t))); v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t))); v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567); vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567); vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567); vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567); vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 
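/* tap-5 weights: 40 bytes into the uint8 weight block of the channel-remainder path */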
* sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567); vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567); vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567); vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567); vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9); const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567); vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10); const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567); vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11); const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi11x01234567); vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12); const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567); vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const 
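/* Taps 13..24 of the remainder path repeat the widen-multiply-accumulate pattern, without pointer increments. */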
v128_t vi13x01234567 = wasm_u16x8_load8x8(i13); const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567); vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14); const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567); vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15); const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567); vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16); const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567); vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17); const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567); vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18); const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567); vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19); const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567); vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20); const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567); vprod01234567 = wasm_i16x8_mul(vi20x01234567, 
vk20x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21); const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567); vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22); const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567); vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23); const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567); vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24); const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567); vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point)); vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if (c & 4) { wasm_v128_store32_lane(output, 
vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qu8_dwconv_minmax_fp32_ukernel_9p8c__wasmsimd_mul16( size_t channels, size_t output_width, const uint8_t** input, const void* weights, uint8_t* output, intptr_t input_stride, size_t output_increment, size_t input_offset, const uint8_t* zero, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(channels != 0); assert(output_width != 0); const v128_t vkernel_zero_point = wasm_u32x4_load16x4(params->fp32_wasmsimd.kernel_zero_point); do { const uint8_t* i0 = input[0]; assert(i0 != NULL); if XNN_UNPREDICTABLE(i0 != zero) { i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); } const uint8_t* i1 = input[1]; assert(i1 != NULL); if XNN_UNPREDICTABLE(i1 != zero) { i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); } const uint8_t* i2 = input[2]; assert(i2 != NULL); if XNN_UNPREDICTABLE(i2 != zero) { i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); } const uint8_t* i3 = input[3]; assert(i3 != NULL); if XNN_UNPREDICTABLE(i3 != zero) { i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); } const uint8_t* i4 = input[4]; assert(i4 != NULL); if XNN_UNPREDICTABLE(i4 != zero) { i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); } const uint8_t* i5 = input[5]; assert(i5 != NULL); if XNN_UNPREDICTABLE(i5 != zero) { i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); } const uint8_t* i6 = input[6]; assert(i6 != NULL); if XNN_UNPREDICTABLE(i6 != zero) { i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); } const uint8_t* i7 = input[7]; assert(i7 != NULL); if XNN_UNPREDICTABLE(i7 != zero) { i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); } const uint8_t* i8 = input[8]; assert(i8 != NULL); if XNN_UNPREDICTABLE(i8 != zero) { i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); } input = (const uint8_t**) ((uintptr_t) input + input_stride); size_t c = channels; const void* w = weights; for (; c >= 8; c -= 8) { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t))); i0 += 8; v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t))); v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567); i1 += 8; vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t))); vsumx01234567 = 
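/* 9-tap variant: same per-tap pattern as the 25-tap kernel, with 72 weight bytes per 8-channel group. */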
wasm_i16x8_add(vsumx01234567, vi2x01234567); i2 += 8; vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567); i3 += 8; vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567); i4 += 8; vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567); i5 += 8; vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567); i6 += 8; vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567); i7 += 8; vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567); i8 += 8; vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point)); vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point)); w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = 
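/* Same fp32 requantization sequence as the 25-tap kernel. */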
wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; } if XNN_UNLIKELY(c != 0) { { v128_t vacc0123 = wasm_v128_load(w); v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t))); const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t))); v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t))); v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567); vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2); const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567); vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3); const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567); vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4); const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567); vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, 
wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5); const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567); vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6); const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567); vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7); const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567); vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8); const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t))); vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567); vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567)); vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point)); vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if (c & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (c & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); 
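      // The two bytes just written came from lanes 0-1 of the packed result;
      // shift the vector right by 16 bits so the final 1-byte store in the
      // (c & 1) case reads the next channel from lane 0.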
vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (c & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_width != 0); } void xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32( size_t batch, const uint8_t* input, float* output, const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vminus_zero_point = wasm_v128_load64_splat(params->wasmsimd.minus_zero_point); const v128_t vscale = wasm_v128_load64_splat(params->wasmsimd.scale); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { v128_t vx01234567 = wasm_u16x8_load8x8(input); v128_t vx89ABCDEF = wasm_u16x8_load8x8(input + 8); v128_t vxGHIJKLMN = wasm_u16x8_load8x8(input + 16); v128_t vxOPQRSTUV = wasm_u16x8_load8x8(input + 24); input += 32; vx01234567 = wasm_i16x8_add(vx01234567, vminus_zero_point); vx89ABCDEF = wasm_i16x8_add(vx89ABCDEF, vminus_zero_point); vxGHIJKLMN = wasm_i16x8_add(vxGHIJKLMN, vminus_zero_point); vxOPQRSTUV = wasm_i16x8_add(vxOPQRSTUV, vminus_zero_point); v128_t vy0123 = wasm_i32x4_extend_low_i16x8(vx01234567); v128_t vy4567 = wasm_i32x4_extend_high_i16x8(vx01234567); v128_t vy89AB = wasm_i32x4_extend_low_i16x8(vx89ABCDEF); v128_t vyCDEF = wasm_i32x4_extend_high_i16x8(vx89ABCDEF); v128_t vyGHIJ = wasm_i32x4_extend_low_i16x8(vxGHIJKLMN); v128_t vyKLMN = wasm_i32x4_extend_high_i16x8(vxGHIJKLMN); v128_t vyOPQR = wasm_i32x4_extend_low_i16x8(vxOPQRSTUV); v128_t vySTUV = wasm_i32x4_extend_high_i16x8(vxOPQRSTUV); vy0123 = wasm_f32x4_convert_i32x4(vy0123); vy4567 = wasm_f32x4_convert_i32x4(vy4567); vy89AB = wasm_f32x4_convert_i32x4(vy89AB); vyCDEF = wasm_f32x4_convert_i32x4(vyCDEF); vyGHIJ = wasm_f32x4_convert_i32x4(vyGHIJ); vyKLMN = wasm_f32x4_convert_i32x4(vyKLMN); vyOPQR = wasm_f32x4_convert_i32x4(vyOPQR); vySTUV = wasm_f32x4_convert_i32x4(vySTUV); vy0123 = wasm_f32x4_mul(vy0123, vscale); vy4567 = wasm_f32x4_mul(vy4567, vscale); vy89AB = wasm_f32x4_mul(vy89AB, vscale); vyCDEF = wasm_f32x4_mul(vyCDEF, vscale); vyGHIJ = wasm_f32x4_mul(vyGHIJ, vscale); vyKLMN = wasm_f32x4_mul(vyKLMN, vscale); vyOPQR = wasm_f32x4_mul(vyOPQR, vscale); vySTUV = wasm_f32x4_mul(vySTUV, vscale); wasm_v128_store(output, vy0123); wasm_v128_store(output + 4, vy4567); wasm_v128_store(output + 8, vy89AB); wasm_v128_store(output + 12, vyCDEF); wasm_v128_store(output + 16, vyGHIJ); wasm_v128_store(output + 20, vyKLMN); wasm_v128_store(output + 24, vyOPQR); wasm_v128_store(output + 28, vySTUV); output += 32; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { v128_t vx = wasm_u16x8_load8x8(input); vx = wasm_i16x8_add(vx, vminus_zero_point); input += 8; v128_t vy_lo = wasm_i32x4_extend_low_i16x8(vx); v128_t vy_hi = wasm_i32x4_extend_high_i16x8(vx); vy_lo = wasm_f32x4_convert_i32x4(vy_lo); vy_hi = wasm_f32x4_convert_i32x4(vy_hi); vy_lo = wasm_f32x4_mul(vy_lo, vscale); vy_hi = wasm_f32x4_mul(vy_hi, vscale); wasm_v128_store(output, vy_lo); wasm_v128_store(output + 4, vy_hi); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); v128_t vx = wasm_u16x8_load8x8(input); vx = wasm_i16x8_add(vx, vminus_zero_point); input += 8; v128_t vy = wasm_i32x4_extend_low_i16x8(vx); vy = wasm_f32x4_convert_i32x4(vy); vy = wasm_f32x4_mul(vy, vscale); if (batch & (4 * sizeof(uint8_t))) { 
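      // Flush the four floats already converted from the low half of vx, then
      // rebuild vy from the high half so the 2- and 1-element stores below
      // write the remaining values.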
wasm_v128_store(output, vy); output += 4; vy = wasm_i32x4_extend_high_i16x8(vx); vy = wasm_f32x4_convert_i32x4(vy); vy = wasm_f32x4_mul(vy, vscale); } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vy, 0); } } } void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16( size_t rows, size_t channels, const uint8_t* input, size_t input_stride, const uint8_t* zero, int32_t* buffer, uint8_t* output, const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows > 7); assert(channels != 0); const uint8_t* i0 = input; const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 16)) { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); wasm_v128_store(b, vacc0123); wasm_v128_store(b + 4, vacc4567); wasm_v128_store(b + 8, vacc89AB); wasm_v128_store(b + 12, vaccCDEF); b += 16; } for (rows -= 7; rows > 7; rows -= 7) { i0 = (const uint8_t*) 
((uintptr_t) i0 + input_increment); i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); int32_t* b = buffer; size_t c = channels; for (; c != 0; c = doz(c, 16)) { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_v128_load(b); v128_t vacc4567 = wasm_v128_load(b + 4); v128_t vacc89AB = wasm_v128_load(b + 8); v128_t vaccCDEF = wasm_v128_load(b + 12); vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); wasm_v128_store(b, vacc0123); wasm_v128_store(b + 4, vacc4567); wasm_v128_store(b + 8, vacc89AB); wasm_v128_store(b + 12, vaccCDEF); b += 16; } } i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = 
wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); for (; channels >= 16; channels -= 16) { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_v128_load(buffer); v128_t vacc4567 = wasm_v128_load(buffer + 4); v128_t vacc89AB = wasm_v128_load(buffer + 8); v128_t vaccCDEF = wasm_v128_load(buffer + 12); buffer += 16; vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = 
wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(channels != 0) { do { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); i0 += 8; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); i1 += 8; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); i2 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); i3 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); i4 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); i5 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); i6 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); v128_t vacc0123 = wasm_v128_load(buffer); v128_t vacc4567 = wasm_v128_load(buffer + 4); buffer += 8; vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(channels >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; channels -= 8; } else { if (channels & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (channels & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (channels & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } channels = 0; } } while (channels != 0); } } void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16( size_t rows, size_t channels, const uint8_t* input, size_t input_stride, const uint8_t* zero, uint8_t* output, const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(rows != 0); assert(rows <= 7); assert(channels != 0); const uint8_t* i0 = input; const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(rows < 2) { i1 = zero; } const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(rows <= 2) { i2 = zero; } const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(rows < 4) { i3 = zero; } const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(rows <= 4) { i4 = zero; } const uint8_t* i5 = (const 
uint8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(rows < 6) { i5 = zero; } const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(rows <= 6) { i6 = zero; } const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); for (; channels >= 16; channels -= 16) { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); i0 += 16; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); i1 += 16; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); i2 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); i3 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); i4 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); i5 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); i6 += 16; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, 
vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); output += 16; } if XNN_UNLIKELY(channels != 0) { do { const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); i0 += 8; const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); i1 += 8; v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); i2 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); i3 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); i4 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); i5 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); i6 += 8; vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(channels >= 8) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; channels -= 8; } else { if (channels & 4) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (channels & 2) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (channels & 1) { wasm_v128_store8_lane(output, vout0123456701234567, 0); output += 1; } channels = 0; } } while (channels != 0); } } void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, const uint8_t* restrict a, size_t a_stride, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 1); assert(nc != 0); assert(kc != 0); assert(kc % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); const uint8_t* a0 = a; uint8_t* c0 = c; kc = round_up_po2(kc, 8 * sizeof(uint8_t)); do { v128_t vacc0x0123 = 
wasm_v128_load(w);
    w = (const int32_t*) w + 4;
    const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.kernel_zero_point);
    size_t k = kc;
    do {
      v128_t vxa0 = wasm_u16x8_load8x8((const v128_t*) a0);
      a0 += 8;
      const v128_t vb01 = wasm_v128_load(w);
      const v128_t vxb0 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb01), vb_zero_point);
      const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      const v128_t vb23 = wasm_v128_load((const uint8_t*) w + 16);
      const v128_t vxb2 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb23), vb_zero_point);
      const v128_t vxb3 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb23), vb_zero_point);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2));
      vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
      vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3));
      w = (const uint8_t*) w + 32;
      k -= 8 * sizeof(uint8_t);
    } while (k != 0);
    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
    const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale);
    const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
    const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
    const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
    v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123);
    v128_t vout = wasm_u8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
    const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
    vout = wasm_u8x16_min(vout, voutput_max);
    if XNN_LIKELY(nc >= 4) {
      wasm_v128_store32_lane(c0, vout, 0);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      nc -= 4;
    } else {
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vout, 0);
        c0 += 2;
        vout = wasm_u32x4_shr(vout, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vout, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128(
    size_t mr,
    size_t nc,
    size_t kc,
    const uint8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  const uint8_t* a0 = a;
  uint8_t* c0 = c;
  const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }
  kc = round_up_po2(kc, 8 * sizeof(uint8_t));
  do {
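    // All four row accumulators start from the same packed int32 bias at the
    // head of the weight stream w. wasm_i32x4_dot_i16x8 multiplies adjacent
    // pairs of int16 lanes and sums them into int32 ("dot16x2"), and each
    // wasm_v32x4_shuffle(..., 1, 2, 3, 4) rotates the activation vector by
    // one 32-bit lane, so four dot/rotate steps sweep every activation pair
    // across all four packed weight panels (the "c2s4" layout).
    //
    // The fp32 requantization after the k-loop uses the magic-bias trick
    // instead of a float-to-int conversion. A rough scalar sketch (bits_of is
    // a hypothetical bit-cast; the magic bias is typically 0x1.8p+23f, which
    // leaves round(acc * scale) in the low mantissa bits):
    //   int32_t vi = max(bits_of((float) acc * scale + magic_bias), magic_min);
    //   uint8_t vy = (uint8_t) (vi - magic_bias_less_output_zero_point);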
v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc1x0123 = vacc0x0123; v128_t vacc2x0123 = vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const int32_t*) w + 4; const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.kernel_zero_point); size_t k = kc; do { v128_t vxa0 = wasm_u16x8_load8x8((const v128_t*) a0); a0 += 8; v128_t vxa1 = wasm_u16x8_load8x8((const v128_t*) a1); a1 += 8; v128_t vxa2 = wasm_u16x8_load8x8((const v128_t*) a2); a2 += 8; v128_t vxa3 = wasm_u16x8_load8x8((const v128_t*) a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb01), vb_zero_point); const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const uint8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb23), vb_zero_point); const v128_t vxb3 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb23), vb_zero_point); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const uint8_t*) w + 32; k -= 8 * sizeof(uint8_t); } while (k != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = 
wasm_f32x4_add(vacc2x0123, vmagic_bias);
    vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias);
    const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);
    const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
    vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point);
    vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point);
    v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123);
    v128_t vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123);
    v128_t vout = wasm_u8x16_narrow_i16x8(vacc01x0123, vacc23x0123);
    const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
    vout = wasm_u8x16_min(vout, voutput_max);
    if XNN_LIKELY(nc >= 4) {
      wasm_v128_store32_lane(c0, vout, 0);
      wasm_v128_store32_lane(c1, vout, 1);
      wasm_v128_store32_lane(c2, vout, 2);
      wasm_v128_store32_lane(c3, vout, 3);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
      c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
      a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
      a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
      a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
      nc -= 4;
    } else {
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vout, 0);
        c0 += 2;
        wasm_v128_store16_lane(c1, vout, 2);
        c1 += 2;
        wasm_v128_store16_lane(c2, vout, 4);
        c2 += 2;
        wasm_v128_store16_lane(c3, vout, 6);
        c3 += 2;
        vout = wasm_u32x4_shr(vout, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vout, 0);
        wasm_v128_store8_lane(c1, vout, 4);
        wasm_v128_store8_lane(c2, vout, 8);
        wasm_v128_store8_lane(c3, vout, 12);
      }
      nc = 0;
    }
  } while (nc != 0);
}

void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  uint8_t* c0 = c;
  kc = round_up_po2(kc, 8 * sizeof(uint8_t));
  do {
    v128_t vacc0x0123 = wasm_v128_load(w);
    w = (const void*) ((const int32_t*) w + 4);
    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;
      const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.kernel_zero_point);
      size_t k = kc;
      do {
        v128_t vxa0 = wasm_u16x8_load8x8(a0);
        a0 += 8;
        const v128_t vb01 = wasm_v128_load(w);
        const v128_t vxb0 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb01), vb_zero_point);
        const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point);
        vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0));
        vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4);
        vacc0x0123 =
wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const uint8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb23), vb_zero_point); const v128_t vxb3 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb23), vb_zero_point); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); w = (const uint8_t*) w + 32; k -= 8 * sizeof(uint8_t); } while (k != 0); p -= 1 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); v128_t vacc00x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123); v128_t vout = wasm_u8x16_narrow_i16x8(vacc00x0123, vacc00x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_u8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c0, vout, 0); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); a = (const uint8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128( size_t mr, size_t nc, size_t kc, size_t ks, const uint8_t** restrict a, const void* restrict w, uint8_t* restrict c, size_t cm_stride, size_t cn_stride, size_t a_offset, const uint8_t* zero, const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(mr != 0); assert(mr <= 4); assert(nc != 0); assert(kc != 0); assert(ks != 0); assert(ks % (4 * sizeof(void*)) == 0); assert(a_offset % sizeof(uint8_t) == 0); assert(a != NULL); assert(w != NULL); assert(c != NULL); uint8_t* c0 = c; uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride); if XNN_UNPREDICTABLE(mr < 2) { c1 = c0; } uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride); if XNN_UNPREDICTABLE(mr <= 2) { c2 = c1; } uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride); if XNN_UNPREDICTABLE(mr != 4) { c3 = c2; } kc = round_up_po2(kc, 8 * sizeof(uint8_t)); do { v128_t vacc0x0123 = wasm_v128_load(w); v128_t vacc1x0123 = vacc0x0123; v128_t vacc2x0123 = vacc0x0123; v128_t vacc3x0123 = vacc0x0123; w = (const void*) ((const int32_t*) w + 4); size_t p = ks; do { const uint8_t* restrict a0 = a[0]; if XNN_UNPREDICTABLE(a0 != zero) { a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset); } const uint8_t* restrict a1 = a[1]; if XNN_UNPREDICTABLE(a1 != zero) { a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset); } const uint8_t* restrict a2 = a[2]; if XNN_UNPREDICTABLE(a2 != zero) { a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset); } const uint8_t* restrict a3 = a[3]; if XNN_UNPREDICTABLE(a3 != zero) { a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset); 
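        // Row pointers are rebased by a_offset only when they do not alias
        // the shared `zero` buffer; rows that point at `zero` supply the
        // padding taps and must keep reading from that buffer.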
} a += 4; const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.kernel_zero_point); size_t k = kc; do { v128_t vxa0 = wasm_u16x8_load8x8(a0); a0 += 8; v128_t vxa1 = wasm_u16x8_load8x8(a1); a1 += 8; v128_t vxa2 = wasm_u16x8_load8x8(a2); a2 += 8; v128_t vxa3 = wasm_u16x8_load8x8(a3); a3 += 8; const v128_t vb01 = wasm_v128_load(w); const v128_t vxb0 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb01), vb_zero_point); const v128_t vxb1 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb01), vb_zero_point); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb0)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb0)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb0)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb0)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb1)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb1)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb1)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb1)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); const v128_t vb23 = wasm_v128_load((const uint8_t*) w + 16); const v128_t vxb2 = wasm_i16x8_sub(wasm_u16x8_extend_low_u8x16(vb23), vb_zero_point); const v128_t vxb3 = wasm_i16x8_sub(wasm_u16x8_extend_high_u8x16(vb23), vb_zero_point); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb2)); vxa0 = wasm_v32x4_shuffle(vxa0, vxa0, 1, 2, 3, 4); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb2)); vxa1 = wasm_v32x4_shuffle(vxa1, vxa1, 1, 2, 3, 4); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb2)); vxa2 = wasm_v32x4_shuffle(vxa2, vxa2, 1, 2, 3, 4); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb2)); vxa3 = wasm_v32x4_shuffle(vxa3, vxa3, 1, 2, 3, 4); vacc0x0123 = wasm_i32x4_add(vacc0x0123, wasm_i32x4_dot_i16x8(vxa0, vxb3)); vacc1x0123 = wasm_i32x4_add(vacc1x0123, wasm_i32x4_dot_i16x8(vxa1, vxb3)); vacc2x0123 = wasm_i32x4_add(vacc2x0123, wasm_i32x4_dot_i16x8(vxa2, vxb3)); vacc3x0123 = wasm_i32x4_add(vacc3x0123, wasm_i32x4_dot_i16x8(vxa3, vxb3)); w = (const uint8_t*) w + 32; k -= 8 * sizeof(uint8_t); } while (k != 0); p -= 4 * sizeof(void*); } while (p != 0); vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); vacc0x0123 
= wasm_i32x4_max(vacc0x0123, vmagic_min); vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); v128_t vacc01x0123 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123); v128_t vacc23x0123 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc3x0123); v128_t vout = wasm_u8x16_narrow_i16x8(vacc01x0123, vacc23x0123); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); vout = wasm_u8x16_min(vout, voutput_max); if (nc >= 4) { wasm_v128_store32_lane(c3, vout, 3); wasm_v128_store32_lane(c2, vout, 2); wasm_v128_store32_lane(c1, vout, 1); wasm_v128_store32_lane(c0, vout, 0); c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride); c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride); c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride); c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride); a = (const uint8_t**restrict) ((uintptr_t) a - ks); nc -= 4; } else { if (nc & 2) { wasm_v128_store16_lane(c3, vout, 6); c3 += 2; wasm_v128_store16_lane(c2, vout, 4); c2 += 2; wasm_v128_store16_lane(c1, vout, 2); c1 += 2; wasm_v128_store16_lane(c0, vout, 0); c0 += 2; vout = wasm_u32x4_shr(vout, 16); } if (nc & 1) { wasm_v128_store8_lane(c3, vout, 12); wasm_v128_store8_lane(c2, vout, 8); wasm_v128_store8_lane(c1, vout, 4); wasm_v128_store8_lane(c0, vout, 0); } nc = 0; } } while (nc != 0); } void xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t vbias = wasm_v128_load64_splat(params->wasmsimd.bias); const v128_t va_multiplier = wasm_v128_load64_splat(params->wasmsimd.a_multiplier); const v128_t vb_multiplier = wasm_v128_load64_splat(params->wasmsimd.b_multiplier); const uint32_t vshift = params->wasmsimd.shift; const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.output_min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.output_max); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const v128_t va01234567 = wasm_u16x8_load8x8(input_a); const v128_t vb01234567 = wasm_u16x8_load8x8(input_b); const v128_t va89ABCDEF = wasm_u16x8_load8x8(input_a + 8); const v128_t vb89ABCDEF = wasm_u16x8_load8x8(input_b + 8); const v128_t vaGHIJKLMN = wasm_u16x8_load8x8(input_a + 16); const v128_t vbGHIJKLMN = wasm_u16x8_load8x8(input_b + 16); const v128_t vaOPQRSTUV = wasm_u16x8_load8x8(input_a + 24); const v128_t vbOPQRSTUV = wasm_u16x8_load8x8(input_b + 24); input_a += 32; input_b += 32; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier)); v128_t vacc89AB = wasm_i32x4_add(vbias, 
wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va89ABCDEF), va_multiplier)); v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va89ABCDEF), va_multiplier)); v128_t vaccGHIJ = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccKLMN = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccOPQR = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vaOPQRSTUV), va_multiplier)); v128_t vaccSTUV = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vaOPQRSTUV), va_multiplier)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier)); vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb89ABCDEF), vb_multiplier)); vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb89ABCDEF), vb_multiplier)); vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vbGHIJKLMN), vb_multiplier)); vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vbGHIJKLMN), vb_multiplier)); vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vbOPQRSTUV), vb_multiplier)); vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vbOPQRSTUV), vb_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); vacc89AB = wasm_i32x4_shr(vacc89AB, vshift); vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift); vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift); vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift); vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift); vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point); v128_t voutGHIJKLMN = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN), voutput_zero_point); v128_t voutOPQRSTUV = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV), voutput_zero_point); v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); vout0123456789ABCDEF = wasm_u8x16_max(vout0123456789ABCDEF, voutput_min); voutGHIJKLMNOPQRSTUV = wasm_u8x16_max(voutGHIJKLMNOPQRSTUV, voutput_min); vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); wasm_v128_store(output, vout0123456789ABCDEF); wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); output += 32; } if XNN_UNLIKELY(batch != 0) { do { const v128_t va01234567 = wasm_u16x8_load8x8(input_a); const v128_t vb01234567 = wasm_u16x8_load8x8(input_b); input_a += 8; input_b += 8; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier)); vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier)); vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 
= wasm_i32x4_shr(vacc4567, vshift); v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_max(vout0123456701234567, voutput_min); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) { wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; batch -= 8 * sizeof(uint8_t); } else { if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } batch = 0; } } while (batch != 0); } } void xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t va_multiplier = wasm_v128_load64_splat(params->wasmsimd.a_multiplier); const uint32_t vshift = params->wasmsimd.shift; const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.output_min); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.output_max); v128_t vbias = wasm_i32x4_splat((int32_t) *input_b * params->wasmsimd.b_multiplier[0]); vbias = wasm_i32x4_add(vbias, wasm_v128_load64_splat(params->wasmsimd.bias)); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { const v128_t va01234567 = wasm_u16x8_load8x8(input_a); const v128_t va89ABCDEF = wasm_u16x8_load8x8(input_a + 8); const v128_t vaGHIJKLMN = wasm_u16x8_load8x8(input_a + 16); const v128_t vaOPQRSTUV = wasm_u16x8_load8x8(input_a + 24); input_a += 32; v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier)); v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier)); v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va89ABCDEF), va_multiplier)); v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va89ABCDEF), va_multiplier)); v128_t vaccGHIJ = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccKLMN = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vaGHIJKLMN), va_multiplier)); v128_t vaccOPQR = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vaOPQRSTUV), va_multiplier)); v128_t vaccSTUV = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vaOPQRSTUV), va_multiplier)); vacc0123 = wasm_i32x4_shr(vacc0123, vshift); vacc4567 = wasm_i32x4_shr(vacc4567, vshift); vacc89AB = wasm_i32x4_shr(vacc89AB, vshift); vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift); vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift); vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift); vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift); vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift); v128_t vout01234567 = 
wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
    v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
    v128_t voutGHIJKLMN = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN), voutput_zero_point);
    v128_t voutOPQRSTUV = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV), voutput_zero_point);

    v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
    v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV);

    vout0123456789ABCDEF = wasm_u8x16_max(vout0123456789ABCDEF, voutput_min);
    voutGHIJKLMNOPQRSTUV = wasm_u8x16_max(voutGHIJKLMNOPQRSTUV, voutput_min);

    vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
    voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max);

    wasm_v128_store(output, vout0123456789ABCDEF);
    wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV);
    output += 32;
  }
  if XNN_UNLIKELY(batch != 0) {
    do {
      const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
      input_a += 8;

      v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
      v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));

      vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
      vacc4567 = wasm_i32x4_shr(vacc4567, vshift);

      v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);

      v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
      vout0123456701234567 = wasm_u8x16_max(vout0123456701234567, voutput_min);
      vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);

      if XNN_LIKELY(batch >= (8 * sizeof(uint8_t))) {
        wasm_v128_store64_lane(output, vout0123456701234567, 0);
        output += 8;
        batch -= 8 * sizeof(uint8_t);
      } else {
        if (batch & (4 * sizeof(uint8_t))) {
          wasm_v128_store32_lane(output, vout0123456701234567, 0);
          vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
          output += 4;
        }
        if (batch & (2 * sizeof(uint8_t))) {
          wasm_v128_store16_lane(output, vout0123456701234567, 0);
          vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
          output += 2;
        }
        if (batch & (1 * sizeof(uint8_t))) {
          wasm_v128_store8_lane(output, vout0123456701234567, 0);
        }
        batch = 0;
      }
    } while (batch != 0);
  }
}

void xnn_qu8_vcvt_ukernel__wasmsimd_x16(
    size_t batch,
    const uint8_t* input,
    uint8_t* output,
    const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint8_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
  const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
  const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
  for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) {
    v128_t vacc0 = wasm_u16x8_load8x8(input);
    v128_t vacc1 = wasm_u16x8_load8x8(input + 8);
    input += 16;

    vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
    vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);

    vacc0 = wasm_i16x8_shl(vacc0, 7);
    vacc1 = wasm_i16x8_shl(vacc1, 7);

    vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier);
    vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier);

    vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
    vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);

    const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
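    // Conversion math, kept entirely in 16-bit lanes: each byte x becomes
    // (input_zero_point - x) << 7, a Q15 rounding multiply-high by the
    // precomputed multiplier rescales that difference, and a saturating add
    // of the output zero point recenters it before the narrow back to u8.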
wasm_v128_store(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { v128_t vacc = wasm_u16x8_load8x8(input); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vacc = wasm_i16x8_shl(vacc, 7); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); input += 8; const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); v128_t vacc = wasm_u16x8_load8x8(input); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vacc = wasm_i16x8_shl(vacc, 7); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point); const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier); const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier); const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { v128_t vx0 = wasm_v128_load(input); v128_t vx1 = wasm_v128_load(input + 16); input += 32; v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_low_u8x16(vx0)); v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_high_u8x16(vx0)); v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15); v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15); v128_t vacc2 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_low_u8x16(vx1)); v128_t vacc3 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_high_u8x16(vx1)); v128_t vmultiplier2 = wasm_i16x8_shr(vacc2, 15); v128_t vmultiplier3 = wasm_i16x8_shr(vacc3, 15); vacc0 = wasm_i16x8_shl(vacc0, 7); vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0); vacc1 = wasm_i16x8_shl(vacc1, 7); vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1); vacc2 = wasm_i16x8_shl(vacc2, 7); vmultiplier2 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier2); vacc3 = wasm_i16x8_shl(vacc3, 7); vmultiplier3 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier3); vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier0); vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier1); vacc2 = wasm_i16x8_q15mulr_sat(vacc2, vmultiplier2); vacc3 = wasm_i16x8_q15mulr_sat(vacc3, vmultiplier3); vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point); vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point); vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point); vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point); const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1); const 
v128_t vy1 = wasm_u8x16_narrow_i16x8(vacc2, vacc3); wasm_v128_store(output, vy0); wasm_v128_store((output + 16), vy1); output += 32; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const v128_t vx = wasm_u16x8_load8x8(input); v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx); v128_t vmultiplier = wasm_i16x8_shr(vacc, 15); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); input += 8; const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); wasm_v128_store64_lane(output, vy, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint8_t)); assert(batch <= 7 * sizeof(uint8_t)); const v128_t vx = wasm_u16x8_load8x8(input); v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx); v128_t vmultiplier = wasm_i16x8_shr(vacc, 15); vacc = wasm_i16x8_shl(vacc, 7); vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier); vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier); vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point); v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc); if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point); const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff); const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base); const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point); for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { v128_t vacc0 = wasm_u16x8_load8x8(input); v128_t vacc1 = wasm_u16x8_load8x8(input + 8); input += 16; v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point); vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0); v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point); vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1); vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff); vacc0 = wasm_i16x8_shl(vacc0, 7); vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base); vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff); vacc1 = wasm_i16x8_shl(vacc1, 7); vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base); vacc0 = wasm_i16x8_q15mulr_sat(vacc0, vmultiplier0); vacc1 = wasm_i16x8_q15mulr_sat(vacc1, vmultiplier1); vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point); vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point); const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1); wasm_v128_store(output, vy0); output += 16; } for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { v128_t vacc = wasm_u16x8_load8x8(input); v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point); vacc = wasm_i16x8_sub(vinput_zero_point, vacc); vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff); vacc = 
wasm_i16x8_shl(vacc, 7);
    vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
    vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
    vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
    input += 8;

    const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
    wasm_v128_store64_lane(output, vy, 0);
    output += 8;
  }
  if XNN_UNLIKELY(batch != 0) {
    assert(batch >= 1 * sizeof(uint8_t));
    assert(batch <= 7 * sizeof(uint8_t));

    v128_t vacc = wasm_u16x8_load8x8(input);
    v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
    vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
    vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
    vacc = wasm_i16x8_shl(vacc, 7);
    vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
    vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
    vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);

    v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
    if (batch & (4 * sizeof(uint8_t))) {
      wasm_v128_store32_lane(output, vy, 0);
      vy = wasm_u64x2_shr(vy, 32);
      output += 4;
    }
    if (batch & (2 * sizeof(uint8_t))) {
      wasm_v128_store16_lane(output, vy, 0);
      vy = wasm_u32x4_shr(vy, 16);
      output += 2;
    }
    if (batch & (1 * sizeof(uint8_t))) {
      wasm_v128_store8_lane(output, vy, 0);
    }
  }
}

void xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8(
    size_t batch,
    const uint8_t* input_a,
    const uint8_t* input_b,
    uint8_t* output,
    const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(uint8_t) == 0);
  assert(input_a != NULL);
  assert(input_b != NULL);
  assert(output != NULL);

  const v128_t va_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.a_zero_point);
  const v128_t vb_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.b_zero_point);
  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
  for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) {
    const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
    const v128_t vb01234567 = wasm_u16x8_load8x8(input_b);
    input_a += 8;
    input_b += 8;

    const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point);
    const v128_t vxb01234567 = wasm_i16x8_sub(vb01234567, vb_zero_point);

    v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb01234567);
    v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb01234567);

    vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
    vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);

    vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
    vacc4567 = wasm_f32x4_mul(vacc4567, vscale);

    // Magic-bias requantization: adding the large vmagic_bias leaves the
    // rounded integer result in the low mantissa bits of the float, so the
    // lower bound (wasm_i32x4_max with vmagic_min) and the combined bias
    // removal / zero-point addition (wasm_i32x4_sub) can stay in integer ops.
    vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
    vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);

    vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
    vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);

    vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
    vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);

    v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);

    v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
    vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);

    wasm_v128_store64_lane(output, vout0123456701234567, 0);
    output += 8;
  }
  if XNN_UNLIKELY(batch != 0) {
    {
      const
v128_t va01234567 = wasm_u16x8_load8x8(input_a); const v128_t vb01234567 = wasm_u16x8_load8x8(input_b); const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); const v128_t vxb01234567 = wasm_i16x8_sub(vb01234567, vb_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb01234567); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb01234567); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } } } } void xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8( size_t batch, const uint8_t* input_a, const uint8_t* input_b, uint8_t* output, const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input_a != NULL); assert(input_b != NULL); assert(output != NULL); const v128_t va_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.a_zero_point); const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); const v128_t vxb = wasm_i16x8_sub( wasm_i16x8_splat((int16_t) *input_b), wasm_v128_load64_splat(params->fp32_wasmsimd.b_zero_point)); for (; batch >= 8 * sizeof(uint8_t); batch -= 8 * sizeof(uint8_t)) { const v128_t va01234567 = wasm_u16x8_load8x8(input_a); input_a += 8; const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = 
wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); wasm_v128_store64_lane(output, vout0123456701234567, 0); output += 8; } if XNN_UNLIKELY(batch != 0) { { const v128_t va01234567 = wasm_u16x8_load8x8(input_a); const v128_t vxa01234567 = wasm_i16x8_sub(va01234567, va_zero_point); v128_t vacc0123 = wasm_i32x4_extmul_low_i16x8(vxa01234567, vxb); v128_t vacc4567 = wasm_i32x4_extmul_high_i16x8(vxa01234567, vxb); vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); vacc0123 = wasm_f32x4_mul(vacc0123, vscale); vacc4567 = wasm_f32x4_mul(vacc4567, vscale); vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vout0123456701234567, 0); vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vout0123456701234567, 0); } } } } void xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8( size_t output_pixels, size_t channels, const int8_t** restrict input, size_t input_offset, const int16_t* restrict weights, int8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset); const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset); const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset); const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset); input += 4; const v128_t valphah = wasm_i16x8_add( wasm_v128_xor( wasm_v128_load16_splat(weights), wasm_i32x4_const_splat(0xFFFF0000)), wasm_i32x4_const_splat(0x08010000)); const v128_t valphav = wasm_i32x4_extend_low_i16x8(wasm_v128_load16_splat(weights + 1)); weights += 2; const v128_t vrounding = wasm_i32x4_const_splat(0x00200000); size_t c = channels; for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) { const v128_t vtl01234567 = wasm_i16x8_load8x8(i0); i0 += 8; const v128_t vtr01234567 = wasm_i16x8_load8x8(i1); i1 += 8; const v128_t vbl01234567 = wasm_i16x8_load8x8(i2); i2 += 8; const v128_t vbr01234567 = wasm_i16x8_load8x8(i3); i3 += 8; const v128_t vdr01234567 = wasm_i16x8_sub(vbr01234567, vtr01234567); const v128_t vt0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vdl01234567 = wasm_i16x8_sub(vbl01234567, vtl01234567); const v128_t vt4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); const v128_t vd0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vd4567 = 
wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); v128_t vacc0123 = wasm_i32x4_mul(vd0123, valphav); v128_t vacc4567 = wasm_i32x4_mul(vd4567, valphav); vacc0123 = wasm_i32x4_add(wasm_i32x4_shl(vt0123, 11), vacc0123); vacc4567 = wasm_i32x4_add(wasm_i32x4_shl(vt4567, 11), vacc4567); vacc0123 = wasm_i32x4_shr(wasm_i16x8_add(vacc0123, vrounding), 22); vacc4567 = wasm_i32x4_shr(wasm_i16x8_add(vacc4567, vrounding), 22); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); const v128_t vo01234567 = wasm_i8x16_narrow_i16x8(vacc01234567, vacc01234567); wasm_v128_store64_lane(output, vo01234567, 0); output += 8; } if XNN_UNLIKELY(c != 0) { const v128_t vtl01234567 = wasm_i16x8_load8x8(i0); const v128_t vtr01234567 = wasm_i16x8_load8x8(i1); const v128_t vbl01234567 = wasm_i16x8_load8x8(i2); const v128_t vbr01234567 = wasm_i16x8_load8x8(i3); const v128_t vdr01234567 = wasm_i16x8_sub(vbr01234567, vtr01234567); const v128_t vt0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vdl01234567 = wasm_i16x8_sub(vbl01234567, vtl01234567); const v128_t vt4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); const v128_t vd0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vd4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); v128_t vacc0123 = wasm_i32x4_mul(vd0123, valphav); v128_t vacc4567 = wasm_i32x4_mul(vd4567, valphav); vacc0123 = wasm_i32x4_add(wasm_i32x4_shl(vt0123, 11), vacc0123); vacc4567 = wasm_i32x4_add(wasm_i32x4_shl(vt4567, 11), vacc4567); vacc0123 = wasm_i32x4_shr(wasm_i16x8_add(vacc0123, vrounding), 22); vacc4567 = wasm_i32x4_shr(wasm_i16x8_add(vacc4567, vrounding), 22); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vo01234567 = wasm_i8x16_narrow_i16x8(vacc01234567, vacc01234567); if (c & (4 * sizeof(int8_t))) { wasm_v128_store32_lane(output, vo01234567, 0); vo01234567 = wasm_u64x2_shr(vo01234567, 32); output += 4; } if (c & (2 * sizeof(int8_t))) { wasm_v128_store16_lane(output, vo01234567, 0); vo01234567 = wasm_u32x4_shr(vo01234567, 16); output += 2; } if (c & (1 * sizeof(int8_t))) { wasm_v128_store8_lane(output, vo01234567, 0); output += 1; } } output = (int8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const int8_t** input, size_t input_offset, int8_t* output, size_t input_increment, size_t output_increment, const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); do { int8_t* o = output; { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; const int8_t* i8 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) 
((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); i8 = (const int8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const v128_t vi0 = wasm_v128_load(i0); i0 += 16; const v128_t vi1 = wasm_v128_load(i1); i1 += 16; const v128_t vi2 = wasm_v128_load(i2); i2 += 16; const v128_t vi3 = wasm_v128_load(i3); i3 += 16; const v128_t vi4 = wasm_v128_load(i4); i4 += 16; const v128_t vi5 = wasm_v128_load(i5); i5 += 16; const v128_t vi6 = wasm_v128_load(i6); i6 += 16; const v128_t vi7 = wasm_v128_load(i7); i7 += 16; const v128_t vi8 = wasm_v128_load(i8); i8 += 16; const v128_t vmax018 = wasm_i8x16_max(wasm_i8x16_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_i8x16_max(vi2, vi3); const v128_t vmax45 = wasm_i8x16_max(vi4, vi5); const v128_t vmax67 = wasm_i8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_i8x16_max(vmax23, vmax45); const v128_t vmax01678 = wasm_i8x16_max(vmax018, vmax67); v128_t vout = wasm_i8x16_max(vmax2345, vmax01678); vout = wasm_i8x16_min(vout, voutput_max); vout = wasm_i8x16_max(vout, voutput_min); wasm_v128_store(o, vout); o += 16; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vi8 = wasm_v128_load(i8); const v128_t vmax018 = wasm_i8x16_max(wasm_i8x16_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_i8x16_max(vi2, vi3); const v128_t vmax45 = wasm_i8x16_max(vi4, vi5); const v128_t vmax67 = wasm_i8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_i8x16_max(vmax23, vmax45); const v128_t vmax01678 = wasm_i8x16_max(vmax018, vmax67); v128_t vout = wasm_i8x16_max(vmax2345, vmax01678); vout = wasm_i8x16_min(vout, voutput_max); vout = wasm_i8x16_max(vout, voutput_min); if (c & 8) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 8; } if (c & 4) { wasm_v128_store32_lane(o, vout, 0); vout = wasm_u64x2_shr(vout, 32); o += 4; } if (c & 2) { wasm_v128_store16_lane(o, vout, 0); vout = wasm_u32x4_shr(vout, 16); o += 2; } if (c & 1) { wasm_v128_store8_lane(o, vout, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const int8_t* i0 = *input++; const int8_t* i1 = *input++; const int8_t* i2 = *input++; const int8_t* i3 = *input++; const int8_t* i4 = *input++; const int8_t* i5 = *input++; const int8_t* i6 = *input++; const int8_t* i7 = *input++; i0 = (const int8_t*) ((uintptr_t) i0 + input_offset); i1 = (const int8_t*) ((uintptr_t) i1 + input_offset); i2 = (const int8_t*) ((uintptr_t) i2 + input_offset); i3 = (const int8_t*) ((uintptr_t) i3 + input_offset); i4 = (const int8_t*) ((uintptr_t) i4 + input_offset); i5 = (const int8_t*) ((uintptr_t) i5 + input_offset); i6 = (const int8_t*) ((uintptr_t) i6 + input_offset); i7 = (const int8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = 
i0; }
      if (k < 4) { i3 = i0; }
      if (k <= 4) { i4 = i0; }
      if (k < 6) { i5 = i0; }
      if (k <= 6) { i6 = i0; }
      if (k < 8) { i7 = i0; }

      o = output;
      size_t c = channels;
      for (; c >= 16; c -= 16) {
        const v128_t vi0 = wasm_v128_load(i0); i0 += 16;
        const v128_t vi1 = wasm_v128_load(i1); i1 += 16;
        const v128_t vi2 = wasm_v128_load(i2); i2 += 16;
        const v128_t vi3 = wasm_v128_load(i3); i3 += 16;
        const v128_t vi4 = wasm_v128_load(i4); i4 += 16;
        const v128_t vi5 = wasm_v128_load(i5); i5 += 16;
        const v128_t vi6 = wasm_v128_load(i6); i6 += 16;
        const v128_t vi7 = wasm_v128_load(i7); i7 += 16;
        const v128_t vo = wasm_v128_load(o);

        const v128_t vmax01 = wasm_i8x16_max(wasm_i8x16_max(vi0, vi1), vo);
        const v128_t vmax23 = wasm_i8x16_max(vi2, vi3);
        const v128_t vmax45 = wasm_i8x16_max(vi4, vi5);
        const v128_t vmax67 = wasm_i8x16_max(vi6, vi7);

        const v128_t vmax2345 = wasm_i8x16_max(vmax23, vmax45);
        const v128_t vmax0167 = wasm_i8x16_max(vmax01, vmax67);
        v128_t vout = wasm_i8x16_max(vmax2345, vmax0167);
        vout = wasm_i8x16_min(vout, voutput_max);
        vout = wasm_i8x16_max(vout, voutput_min);

        wasm_v128_store(o, vout);
        o += 16;
      }
      if (c != 0) {
        const v128_t vi0 = wasm_v128_load(i0);
        const v128_t vi1 = wasm_v128_load(i1);
        const v128_t vi2 = wasm_v128_load(i2);
        const v128_t vi3 = wasm_v128_load(i3);
        const v128_t vi4 = wasm_v128_load(i4);
        const v128_t vi5 = wasm_v128_load(i5);
        const v128_t vi6 = wasm_v128_load(i6);
        const v128_t vi7 = wasm_v128_load(i7);
        const v128_t vo = wasm_v128_load(o);

        const v128_t vmax01 = wasm_i8x16_max(wasm_i8x16_max(vi0, vi1), vo);
        const v128_t vmax23 = wasm_i8x16_max(vi2, vi3);
        const v128_t vmax45 = wasm_i8x16_max(vi4, vi5);
        const v128_t vmax67 = wasm_i8x16_max(vi6, vi7);

        const v128_t vmax2345 = wasm_i8x16_max(vmax23, vmax45);
        const v128_t vmax0167 = wasm_i8x16_max(vmax01, vmax67);
        v128_t vout = wasm_i8x16_max(vmax2345, vmax0167);
        vout = wasm_i8x16_min(vout, voutput_max);
        vout = wasm_i8x16_max(vout, voutput_min);

        if (c & 8) {
          wasm_v128_store64_lane(o, vout, 0);
          vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
          o += 8;
        }
        if (c & 4) {
          wasm_v128_store32_lane(o, vout, 0);
          vout = wasm_u64x2_shr(vout, 32);
          o += 4;
        }
        if (c & 2) {
          wasm_v128_store16_lane(o, vout, 0);
          vout = wasm_u32x4_shr(vout, 16);
          o += 2;
        }
        if (c & 1) {
          wasm_v128_store8_lane(o, vout, 0);
          o += 1;
        }
      }
    }
    input = (const int8_t**) ((uintptr_t) input + input_increment);
    output = (int8_t*) ((uintptr_t) o + output_increment);
  } while (--output_pixels != 0);
}

void xnn_s8_vclamp_ukernel__wasmsimd_x64(
    size_t batch,
    const int8_t* input,
    int8_t* output,
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(batch != 0);
  assert(batch % sizeof(int8_t) == 0);
  assert(input != NULL);
  assert(output != NULL);

  const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max);
  const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min);
  for (; batch >= 64; batch -= 64) {
    v128_t vacc0 = wasm_v128_load(input);
    v128_t vacc1 = wasm_v128_load(input + 16);
    v128_t vacc2 = wasm_v128_load(input + 32);
    v128_t vacc3 = wasm_v128_load(input + 48);
    input += 64;

    vacc0 = wasm_i8x16_max(vacc0, voutput_min);
    vacc1 = wasm_i8x16_max(vacc1, voutput_min);
    vacc2 = wasm_i8x16_max(vacc2, voutput_min);
    vacc3 = wasm_i8x16_max(vacc3, voutput_min);

    vacc0 = wasm_i8x16_min(vacc0, voutput_max);
    vacc1 = wasm_i8x16_min(vacc1, voutput_max);
    vacc2 = wasm_i8x16_min(vacc2, voutput_max);
    vacc3 = wasm_i8x16_min(vacc3, voutput_max);

    wasm_v128_store(output, vacc0);
    wasm_v128_store(output + 16, vacc1);
    wasm_v128_store(output + 32, vacc2);
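    // Lower bound first (i8x16_max with output_min), then upper bound
    // (i8x16_min with output_max): per byte this computes
    // y = min(max(x, output_min), output_max), 64 lanes per iteration.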
wasm_v128_store(output + 48, vacc3); output += 64; } for (; batch >= 16; batch -= 16) { v128_t vacc = wasm_v128_load(input); input += 16; vacc = wasm_i8x16_min(vacc, voutput_max); vacc = wasm_i8x16_max(vacc, voutput_min); wasm_v128_store(output, vacc); output += 16; } if XNN_UNLIKELY(batch != 0) { v128_t vacc = wasm_v128_load(input); vacc = wasm_i8x16_min(vacc, voutput_max); vacc = wasm_i8x16_max(vacc, voutput_min); if (batch & 8) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 8; } if (batch & 4) { wasm_v128_store32_lane(output, vacc, 0); vacc = wasm_u64x2_shr(vacc, 32); output += 4; } if (batch & 2) { wasm_v128_store16_lane(output, vacc, 0); vacc = wasm_u32x4_shr(vacc, 16); output += 2; } if (batch & 1) { wasm_v128_store8_lane(output, vacc, 0); } } } void xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8( size_t output_pixels, size_t channels, const uint8_t** restrict input, size_t input_offset, const int16_t* restrict weights, uint8_t* restrict output, size_t output_increment) XNN_OOB_READS { assert(output_pixels != 0); assert(channels != 0); do { const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset); const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset); const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset); const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset); input += 4; const v128_t valphah = wasm_i16x8_add( wasm_v128_xor( wasm_v128_load16_splat(weights), wasm_i32x4_const_splat(0xFFFF0000)), wasm_i32x4_const_splat(0x08010000)); const v128_t valphav = wasm_i32x4_extend_low_i16x8(wasm_v128_load16_splat(weights + 1)); weights += 2; const v128_t vrounding = wasm_i32x4_const_splat(0x00200000); size_t c = channels; for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) { const v128_t vtl01234567 = wasm_u16x8_load8x8(i0); i0 += 8; const v128_t vtr01234567 = wasm_u16x8_load8x8(i1); i1 += 8; const v128_t vbl01234567 = wasm_u16x8_load8x8(i2); i2 += 8; const v128_t vbr01234567 = wasm_u16x8_load8x8(i3); i3 += 8; const v128_t vdr01234567 = wasm_i16x8_sub(vbr01234567, vtr01234567); const v128_t vt0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vdl01234567 = wasm_i16x8_sub(vbl01234567, vtl01234567); const v128_t vt4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); const v128_t vd0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vd4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); v128_t vacc0123 = wasm_i32x4_mul(vd0123, valphav); v128_t vacc4567 = wasm_i32x4_mul(vd4567, valphav); vacc0123 = wasm_i32x4_add(wasm_i32x4_shl(vt0123, 11), vacc0123); vacc4567 = wasm_i32x4_add(wasm_i32x4_shl(vt4567, 11), vacc4567); vacc0123 = wasm_u32x4_shr(wasm_i16x8_add(vacc0123, vrounding), 22); vacc4567 = wasm_u32x4_shr(wasm_i16x8_add(vacc4567, vrounding), 22); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); const v128_t vo01234567 = wasm_u8x16_narrow_i16x8(vacc01234567, vacc01234567); wasm_v128_store64_lane(output, vo01234567, 0); output += 8; } if XNN_UNLIKELY(c != 0) { const v128_t vtl01234567 = wasm_u16x8_load8x8(i0); const v128_t vtr01234567 = wasm_u16x8_load8x8(i1); const v128_t vbl01234567 = wasm_u16x8_load8x8(i2); const v128_t vbr01234567 = wasm_u16x8_load8x8(i3); const v128_t vdr01234567 
= wasm_i16x8_sub(vbr01234567, vtr01234567); const v128_t vt0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vdl01234567 = wasm_i16x8_sub(vbl01234567, vtl01234567); const v128_t vt4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vtr01234567, vtl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); const v128_t vd0123 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 0, 8, 1, 9, 2, 10, 3, 11), valphah); const v128_t vd4567 = wasm_i32x4_dot_i16x8(wasm_v16x8_shuffle(vdr01234567, vdl01234567, 4, 12, 5, 13, 6, 14, 7, 15), valphah); v128_t vacc0123 = wasm_i32x4_mul(vd0123, valphav); v128_t vacc4567 = wasm_i32x4_mul(vd4567, valphav); vacc0123 = wasm_i32x4_add(wasm_i32x4_shl(vt0123, 11), vacc0123); vacc4567 = wasm_i32x4_add(wasm_i32x4_shl(vt4567, 11), vacc4567); vacc0123 = wasm_u32x4_shr(wasm_i16x8_add(vacc0123, vrounding), 22); vacc4567 = wasm_u32x4_shr(wasm_i16x8_add(vacc4567, vrounding), 22); const v128_t vacc01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); v128_t vo01234567 = wasm_u8x16_narrow_i16x8(vacc01234567, vacc01234567); if (c & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vo01234567, 0); vo01234567 = wasm_u64x2_shr(vo01234567, 32); output += 4; } if (c & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vo01234567, 0); vo01234567 = wasm_u32x4_shr(vo01234567, 16); output += 2; } if (c & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vo01234567, 0); output += 1; } } output = (uint8_t*) ((uintptr_t) output + output_increment); } while (--output_pixels != 0); } void xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16( size_t output_pixels, size_t kernel_elements, size_t channels, const uint8_t** input, size_t input_offset, uint8_t* output, size_t input_increment, size_t output_increment, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_pixels != 0); assert(kernel_elements != 0); assert(channels != 0); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); do { uint8_t* o = output; { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; const uint8_t* i8 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset); if (kernel_elements < 2) { i1 = i0; } if (kernel_elements <= 2) { i2 = i0; } if (kernel_elements < 4) { i3 = i0; } if (kernel_elements <= 4) { i4 = i0; } if (kernel_elements < 6) { i5 = i0; } if (kernel_elements <= 6) { i6 = i0; } if (kernel_elements < 8) { i7 = i0; } if (kernel_elements <= 8) { i8 = i0; } size_t c = channels; for (; c >= 16; c -= 16) { const v128_t vi0 = wasm_v128_load(i0); i0 += 16; const v128_t vi1 = wasm_v128_load(i1); i1 += 16; const v128_t vi2 = wasm_v128_load(i2); i2 += 16; const v128_t vi3 = wasm_v128_load(i3); i3 += 16; const v128_t vi4 = wasm_v128_load(i4); i4 += 16; const v128_t vi5 
= wasm_v128_load(i5); i5 += 16; const v128_t vi6 = wasm_v128_load(i6); i6 += 16; const v128_t vi7 = wasm_v128_load(i7); i7 += 16; const v128_t vi8 = wasm_v128_load(i8); i8 += 16; const v128_t vmax018 = wasm_u8x16_max(wasm_u8x16_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_u8x16_max(vi2, vi3); const v128_t vmax45 = wasm_u8x16_max(vi4, vi5); const v128_t vmax67 = wasm_u8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_u8x16_max(vmax23, vmax45); const v128_t vmax01678 = wasm_u8x16_max(vmax018, vmax67); v128_t vout = wasm_u8x16_max(vmax2345, vmax01678); vout = wasm_u8x16_min(vout, voutput_max); vout = wasm_u8x16_max(vout, voutput_min); wasm_v128_store(o, vout); o += 16; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vi8 = wasm_v128_load(i8); const v128_t vmax018 = wasm_u8x16_max(wasm_u8x16_max(vi0, vi1), vi8); const v128_t vmax23 = wasm_u8x16_max(vi2, vi3); const v128_t vmax45 = wasm_u8x16_max(vi4, vi5); const v128_t vmax67 = wasm_u8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_u8x16_max(vmax23, vmax45); const v128_t vmax01678 = wasm_u8x16_max(vmax018, vmax67); v128_t vout = wasm_u8x16_max(vmax2345, vmax01678); vout = wasm_u8x16_min(vout, voutput_max); vout = wasm_u8x16_max(vout, voutput_min); if (c & 8) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 8; } if (c & 4) { wasm_v128_store32_lane(o, vout, 0); vout = wasm_u64x2_shr(vout, 32); o += 4; } if (c & 2) { wasm_v128_store16_lane(o, vout, 0); vout = wasm_u32x4_shr(vout, 16); o += 2; } if (c & 1) { wasm_v128_store8_lane(o, vout, 0); o += 1; } } } for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) { const uint8_t* i0 = *input++; const uint8_t* i1 = *input++; const uint8_t* i2 = *input++; const uint8_t* i3 = *input++; const uint8_t* i4 = *input++; const uint8_t* i5 = *input++; const uint8_t* i6 = *input++; const uint8_t* i7 = *input++; i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset); i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset); i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset); i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset); i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset); i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset); i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset); i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset); if (k < 2) { i1 = i0; } if (k <= 2) { i2 = i0; } if (k < 4) { i3 = i0; } if (k <= 4) { i4 = i0; } if (k < 6) { i5 = i0; } if (k <= 6) { i6 = i0; } if (k < 8) { i7 = i0; } o = output; size_t c = channels; for (; c >= 16; c -= 16) { const v128_t vi0 = wasm_v128_load(i0); i0 += 16; const v128_t vi1 = wasm_v128_load(i1); i1 += 16; const v128_t vi2 = wasm_v128_load(i2); i2 += 16; const v128_t vi3 = wasm_v128_load(i3); i3 += 16; const v128_t vi4 = wasm_v128_load(i4); i4 += 16; const v128_t vi5 = wasm_v128_load(i5); i5 += 16; const v128_t vi6 = wasm_v128_load(i6); i6 += 16; const v128_t vi7 = wasm_v128_load(i7); i7 += 16; const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_u8x16_max(wasm_u8x16_max(vi0, vi1), vo); const v128_t vmax23 = wasm_u8x16_max(vi2, vi3); const v128_t vmax45 = wasm_u8x16_max(vi4, vi5); const v128_t vmax67 = wasm_u8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_u8x16_max(vmax23, vmax45); const v128_t 
vmax0167 = wasm_u8x16_max(vmax01, vmax67); v128_t vout = wasm_u8x16_max(vmax2345, vmax0167); vout = wasm_u8x16_min(vout, voutput_max); vout = wasm_u8x16_max(vout, voutput_min); wasm_v128_store(o, vout); o += 16; } if (c != 0) { const v128_t vi0 = wasm_v128_load(i0); const v128_t vi1 = wasm_v128_load(i1); const v128_t vi2 = wasm_v128_load(i2); const v128_t vi3 = wasm_v128_load(i3); const v128_t vi4 = wasm_v128_load(i4); const v128_t vi5 = wasm_v128_load(i5); const v128_t vi6 = wasm_v128_load(i6); const v128_t vi7 = wasm_v128_load(i7); const v128_t vo = wasm_v128_load(o); const v128_t vmax01 = wasm_u8x16_max(wasm_u8x16_max(vi0, vi1), vo); const v128_t vmax23 = wasm_u8x16_max(vi2, vi3); const v128_t vmax45 = wasm_u8x16_max(vi4, vi5); const v128_t vmax67 = wasm_u8x16_max(vi6, vi7); const v128_t vmax2345 = wasm_u8x16_max(vmax23, vmax45); const v128_t vmax0167 = wasm_u8x16_max(vmax01, vmax67); v128_t vout = wasm_u8x16_max(vmax2345, vmax0167); vout = wasm_u8x16_min(vout, voutput_max); vout = wasm_u8x16_max(vout, voutput_min); if (c & 8) { wasm_v128_store64_lane(o, vout, 0); vout = wasm_v64x2_shuffle(vout, vout, 1, 1); o += 8; } if (c & 4) { wasm_v128_store32_lane(o, vout, 0); vout = wasm_u64x2_shr(vout, 32); o += 4; } if (c & 2) { wasm_v128_store16_lane(o, vout, 0); vout = wasm_u32x4_shr(vout, 16); o += 2; } if (c & 1) { wasm_v128_store8_lane(o, vout, 0); o += 1; } } } input = (const uint8_t**) ((uintptr_t) input + input_increment); output = (uint8_t*) ((uintptr_t) o + output_increment); } while (--output_pixels != 0); } void xnn_u8_vclamp_ukernel__wasmsimd_x64( size_t batch, const uint8_t* input, uint8_t* output, const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t voutput_max = wasm_v128_load64_splat(params->wasmsimd.max); const v128_t voutput_min = wasm_v128_load64_splat(params->wasmsimd.min); for (; batch >= 64; batch -= 64) { v128_t vacc0 = wasm_v128_load(input); v128_t vacc1 = wasm_v128_load(input + 16); v128_t vacc2 = wasm_v128_load(input + 32); v128_t vacc3 = wasm_v128_load(input + 48); input += 64; vacc0 = wasm_u8x16_max(vacc0, voutput_min); vacc1 = wasm_u8x16_max(vacc1, voutput_min); vacc2 = wasm_u8x16_max(vacc2, voutput_min); vacc3 = wasm_u8x16_max(vacc3, voutput_min); vacc0 = wasm_u8x16_min(vacc0, voutput_max); vacc1 = wasm_u8x16_min(vacc1, voutput_max); vacc2 = wasm_u8x16_min(vacc2, voutput_max); vacc3 = wasm_u8x16_min(vacc3, voutput_max); wasm_v128_store(output, vacc0); wasm_v128_store(output + 16, vacc1); wasm_v128_store(output + 32, vacc2); wasm_v128_store(output + 48, vacc3); output += 64; } for (; batch >= 16; batch -= 16) { v128_t vacc = wasm_v128_load(input); input += 16; vacc = wasm_u8x16_min(vacc, voutput_max); vacc = wasm_u8x16_max(vacc, voutput_min); wasm_v128_store(output, vacc); output += 16; } if XNN_UNLIKELY(batch != 0) { v128_t vacc = wasm_v128_load(input); vacc = wasm_u8x16_min(vacc, voutput_max); vacc = wasm_u8x16_max(vacc, voutput_min); if (batch & 8) { wasm_v128_store64_lane(output, vacc, 0); vacc = wasm_v64x2_shuffle(vacc, vacc, 1, 1); output += 8; } if (batch & 4) { wasm_v128_store32_lane(output, vacc, 0); vacc = wasm_u64x2_shr(vacc, 32); output += 4; } if (batch & 2) { wasm_v128_store16_lane(output, vacc, 0); vacc = wasm_u32x4_shr(vacc, 16); output += 2; } if (batch & 1) { wasm_v128_store8_lane(output, vacc, 0); } } } void xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd( const uint16_t* input, uint16_t* 
output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x16_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint16_t)); assert(input_stride >= block_width * sizeof(uint16_t)); const size_t tile_height = 8; const size_t tile_width = 8; const size_t tile_hbytes = tile_height * sizeof(uint16_t); const size_t tile_wbytes = tile_width * sizeof(uint16_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes; const uint16_t* i0 = input; uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 7); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 8; bh -= 8) { const v128_t v3_0 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_1 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_2 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_3 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_4 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_5 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_6 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_7 = wasm_v128_load(i0); i0 = (uint16_t*) ((uintptr_t) i0 + input_stride); const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v0_1 = wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 
2, 10, 3, 11); const v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15); o = (uint16_t*) ((uintptr_t) o + oN_offset); wasm_v128_store(o, v0_7); uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 7) { o = oN; } wasm_v128_store(o, v0_6); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 7) { o = oN; } wasm_v128_store(o, v0_5); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 5) { o = oN; } wasm_v128_store(o, v0_4); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 5) { o = oN; } wasm_v128_store(o, v0_3); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store(o, v0_2); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store(o, v0_1); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store(o, v0_0); } o = (uint16_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const v128_t v3_0 = wasm_v128_load(i0); const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const v128_t v3_1 = wasm_v128_load(i1); const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const v128_t v3_2 = wasm_v128_load(i2); const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const v128_t v3_3 = wasm_v128_load(i3); const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const v128_t v3_4 = wasm_v128_load(i4); const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const v128_t v3_5 = wasm_v128_load(i5); const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const v128_t v3_6 = wasm_v128_load(i6); const v128_t v3_7 = wasm_v128_xor(v3_0, v3_0); const v128_t v2_0 = wasm_v16x8_shuffle(v3_0, v3_4, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_1 = wasm_v16x8_shuffle(v3_0, v3_4, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_2 = wasm_v16x8_shuffle(v3_1, v3_5, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_3 = wasm_v16x8_shuffle(v3_1, v3_5, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_4 = wasm_v16x8_shuffle(v3_2, v3_6, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_5 = wasm_v16x8_shuffle(v3_2, v3_6, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v2_6 = wasm_v16x8_shuffle(v3_3, v3_7, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v2_7 = wasm_v16x8_shuffle(v3_3, v3_7, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_0 = wasm_v16x8_shuffle(v2_0, v2_4, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_1 = wasm_v16x8_shuffle(v2_0, v2_4, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_2 = wasm_v16x8_shuffle(v2_1, v2_5, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_3 = wasm_v16x8_shuffle(v2_1, v2_5, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_4 = wasm_v16x8_shuffle(v2_2, v2_6, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_5 = wasm_v16x8_shuffle(v2_2, v2_6, 4, 12, 5, 13, 6, 14, 7, 15); const v128_t v1_6 = wasm_v16x8_shuffle(v2_3, v2_7, 0, 8, 1, 9, 2, 10, 3, 11); const v128_t v1_7 = wasm_v16x8_shuffle(v2_3, v2_7, 4, 12, 5, 13, 6, 14, 7, 15); v128_t v0_0 = wasm_v16x8_shuffle(v1_0, v1_4, 0, 8, 1, 9, 2, 10, 3, 11); v128_t v0_1 = 
wasm_v16x8_shuffle(v1_0, v1_4, 4, 12, 5, 13, 6, 14, 7, 15); v128_t v0_2 = wasm_v16x8_shuffle(v1_1, v1_5, 0, 8, 1, 9, 2, 10, 3, 11); v128_t v0_3 = wasm_v16x8_shuffle(v1_1, v1_5, 4, 12, 5, 13, 6, 14, 7, 15); v128_t v0_4 = wasm_v16x8_shuffle(v1_2, v1_6, 0, 8, 1, 9, 2, 10, 3, 11); v128_t v0_5 = wasm_v16x8_shuffle(v1_2, v1_6, 4, 12, 5, 13, 6, 14, 7, 15); v128_t v0_6 = wasm_v16x8_shuffle(v1_3, v1_7, 0, 8, 1, 9, 2, 10, 3, 11); v128_t v0_7 = wasm_v16x8_shuffle(v1_3, v1_7, 4, 12, 5, 13, 6, 14, 7, 15); if (bh & 4) { o = (uint16_t*) ((uintptr_t) o + oN_stride); wasm_v128_store64_lane(o, v0_7, 0); uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 7) { o = oN; } wasm_v128_store64_lane(o, v0_6, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 7) { o = oN; } wasm_v128_store64_lane(o, v0_5, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 5) { o = oN; } wasm_v128_store64_lane(o, v0_4, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 5) { o = oN; } wasm_v128_store64_lane(o, v0_3, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store64_lane(o, v0_2, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store64_lane(o, v0_1, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store64_lane(o, v0_0, 0); o += 4; v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1); v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1); v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1); v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1); v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1); v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1); v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1); v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1); } if (bh & 2) { o = (uint16_t*) ((uintptr_t) o + oN_stride); wasm_v128_store32_lane(o, v0_7, 0); uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 7) { o = oN; } wasm_v128_store32_lane(o, v0_6, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 7) { o = oN; } wasm_v128_store32_lane(o, v0_5, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 5) { o = oN; } wasm_v128_store32_lane(o, v0_4, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 5) { o = oN; } wasm_v128_store32_lane(o, v0_3, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store32_lane(o, v0_2, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store32_lane(o, v0_1, 0); oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store32_lane(o, v0_0, 0); o += 2; v0_0 = wasm_u64x2_shr(v0_0, 32); v0_1 = wasm_u64x2_shr(v0_1, 32); v0_2 = wasm_u64x2_shr(v0_2, 32); v0_3 = wasm_u64x2_shr(v0_3, 32); v0_4 = wasm_u64x2_shr(v0_4, 32); v0_5 = wasm_u64x2_shr(v0_5, 32); v0_6 = wasm_u64x2_shr(v0_6, 32); v0_7 = wasm_u64x2_shr(v0_7, 32); } if (bh & 1) { o = (uint16_t*) ((uintptr_t) o + oN_stride); wasm_v128_store16_lane(o, v0_7, 0); uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 7) { o = oN; 
}
        wasm_v128_store16_lane(o, v0_6, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) { o = oN; }
        wasm_v128_store16_lane(o, v0_5, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) { o = oN; }
        wasm_v128_store16_lane(o, v0_4, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) { o = oN; }
        wasm_v128_store16_lane(o, v0_3, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) { o = oN; }
        wasm_v128_store16_lane(o, v0_2, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; }
        wasm_v128_store16_lane(o, v0_1, 0);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) { o = oN; }
        wasm_v128_store16_lane(o, v0_0, 0);
      }
    }

    i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
    o = (uint16_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}

void xnn_x32_packw_gemm_goi_ukernel_x2c4__wasmsimd_x4(
    size_t g,
    size_t nc,
    size_t kc,
    size_t nr,
    size_t kr,
    size_t sr,
    const uint32_t* weights,
    const uint32_t* bias,
    uint32_t* packed_weights,
    size_t extra_bytes,
    const void* params)
{
  assert(g != 0);
  assert(nc != 0);
  assert(kc != 0);
  assert(nr == 2);
  assert(kr == 4);
  assert(sr == 1);
  assert(weights != NULL);
  assert(packed_weights != NULL);

  do {
    // NC main loop multiple of 2
    const uint32_t* w0 = (const uint32_t*) weights;
    size_t n = nc;
    for (; n >= 2; n -= 2) {
      if XNN_LIKELY(bias != NULL) {
        packed_weights[0] = bias[0];
        packed_weights[1] = bias[1];
        bias += 2;
      } else {
        packed_weights[0] = 0;
        packed_weights[1] = 0;
      }
      packed_weights += 2;

      const uint32_t* w1 = w0 + kc;

      // KC main loop multiple of 2x4
      size_t k = kc;
      for (; k >= 4; k -= 4) {
        // Read blocks of 2x4
        // a b c d
        // e f g h
        const v128_t v0 = wasm_v128_load(w0); w0 += 4;
        const v128_t v1 = wasm_v128_load(w1); w1 += 4;
        wasm_v128_store(packed_weights, v0);
        wasm_v128_store(packed_weights + 4, v1);
        packed_weights += 8;
      }

      // KC remainder (1..3)
      if XNN_UNLIKELY(k != 0) {
        assert(k >= 1);
        assert(k <= 3);
        switch (k) {
          case 1:
          {
            // Read blocks of 2x1
            // a
            // e
            const v128_t v0 = wasm_v128_load32_zero(w0); ++w0;
            const v128_t v1 = wasm_v128_load32_zero(w1); ++w1;
            wasm_v128_store(packed_weights, v0);
            wasm_v128_store(packed_weights + 4, v1);
            packed_weights += 8;
            break;
          }
          case 2:
          {
            // Read blocks of 2x2
            // a b
            // e f
            const v128_t v0 = wasm_v128_load64_zero(w0); w0 += 2;
            const v128_t v1 = wasm_v128_load64_zero(w1); w1 += 2;
            wasm_v128_store(packed_weights, v0);
            wasm_v128_store(packed_weights + 4, v1);
            packed_weights += 8;
            break;
          }
          case 3:
          {
            // Read blocks of 2x3
            // a b c
            // e f g
            v128_t v0 = wasm_v128_load64_zero(w0);
            v0 = wasm_v128_load32_lane(w0 + 2, v0, 2);
            w0 += 3;
            v128_t v1 = wasm_v128_load64_zero(w1);
            v1 = wasm_v128_load32_lane(w1 + 2, v1, 2);
            w1 += 3;
            wasm_v128_store(packed_weights, v0);
            wasm_v128_store(packed_weights + 4, v1);
            packed_weights += 8;
            break;
          }
          default:
            XNN_UNREACHABLE;
        }
      }
      packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes);
      w0 = w1;
    }

    // NC remainder (1..1)
    if XNN_UNLIKELY(n != 0) {
      assert(n >= 1);
      assert(n <= 1);
      if XNN_LIKELY(bias != NULL) {
        size_t nb = n;
        do {
          *packed_weights++ = *bias++;
        } while (--nb != 0);
        packed_weights += (2 - n);
      } else {
        packed_weights[0] = 0;
        packed_weights[1] = 0;
        packed_weights += 2;
      }

      // NR remainder has less than 2 rows so last row is not loaded
      // KC main loop multiple of 2x4
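      // Packed panel layout per 4-wide k step: four consecutive 32-bit
      // values from row n, then four from row n + 1 (eight values per step,
      // after the two bias slots written above). In this n == 1 remainder
      // the single remaining row is stored into both halves of the panel,
      // which keeps the layout uniform for the consuming GEMM microkernel.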
size_t k = kc; for (; k >= 4; k -= 4) { // Read blocks of 1x4 // a b c d const v128_t v0 = wasm_v128_load(w0); w0 += 4; wasm_v128_store(packed_weights, v0); wasm_v128_store(packed_weights + 4, v0); packed_weights += 8; } // KC remainder (1..3) if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { case 1: { // Read blocks of 1x1 // a const v128_t v0 = wasm_v128_load32_zero(w0); ++w0; wasm_v128_store(packed_weights, v0); wasm_v128_store(packed_weights + 4, v0); packed_weights += 8; break; } case 2: { // Read blocks of 1x2 // a b const v128_t v0 = wasm_v128_load64_zero(w0); w0 += 2; wasm_v128_store(packed_weights, v0); wasm_v128_store(packed_weights + 4, v0); packed_weights += 8; break; } case 3: { // Read blocks of 1x3 // a b c v128_t v0 = wasm_v128_load64_zero(w0); v0 = wasm_v128_load32_lane(w0 + 2, v0, 2); w0 += 3; wasm_v128_store(packed_weights, v0); wasm_v128_store(packed_weights + 4, v0); packed_weights += 8; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x32_packw_gemm_goi_ukernel_x8__wasmsimd_x4( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const uint32_t* weights, const uint32_t* bias, uint32_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); do { // NC main loop multiple of 8 const uint32_t* w0 = (const uint32_t*) weights; size_t n = nc; for (; n >= 8; n -= 8) { if XNN_LIKELY(bias != NULL) { const v128_t vb0123 = wasm_v128_load(bias); const v128_t vb4567 = wasm_v128_load(bias + 4); bias += 8; wasm_v128_store(packed_weights, vb0123); wasm_v128_store(packed_weights + 4, vb4567); } else { const v128_t vzero = wasm_i32x4_const_splat(0); wasm_v128_store(packed_weights, vzero); wasm_v128_store(packed_weights + 4, vzero); } packed_weights += 8; const uint32_t* w1 = w0 + kc; const uint32_t* w2 = w1 + kc; const uint32_t* w3 = w2 + kc; const uint32_t* w4 = w3 + kc; const uint32_t* w5 = w4 + kc; const uint32_t* w6 = w5 + kc; const uint32_t* w7 = w6 + kc; // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const v128_t v0x0123 = wasm_v128_load(w0); w0 += 4; const v128_t v1x0123 = wasm_v128_load(w1); w1 += 4; const v128_t v2x0123 = wasm_v128_load(w2); w2 += 4; const v128_t v3x0123 = wasm_v128_load(w3); w3 += 4; const v128_t v4x0123 = wasm_v128_load(w4); w4 += 4; const v128_t v5x0123 = wasm_v128_load(w5); w5 += 4; const v128_t v6x0123 = wasm_v128_load(w6); w6 += 4; const v128_t v7x0123 = wasm_v128_load(w7); w7 += 4; const v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x0123, v1x0123, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x0123, v3x0123, 0, 4, 1, 5); const v128_t v01x2_01x3 = wasm_v32x4_shuffle(v0x0123, v1x0123, 2, 6, 3, 7); const v128_t v23x2_23x3 = wasm_v32x4_shuffle(v2x0123, v3x0123, 2, 6, 3, 7); const v128_t v45x0_45x1 = wasm_v32x4_shuffle(v4x0123, v5x0123, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x0123, v7x0123, 0, 4, 1, 5); const v128_t v45x2_45x3 = wasm_v32x4_shuffle(v4x0123, v5x0123, 2, 6, 3, 7); const v128_t v67x2_67x3 = wasm_v32x4_shuffle(v6x0123, v7x0123, 2, 6, 3, 7); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v0123x2 = wasm_v64x2_shuffle(v01x2_01x3, v23x2_23x3, 0, 2); const v128_t v0123x3 =
wasm_v64x2_shuffle(v01x2_01x3, v23x2_23x3, 1, 3); const v128_t v4567x0 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); const v128_t v4567x2 = wasm_v64x2_shuffle(v45x2_45x3, v67x2_67x3, 0, 2); const v128_t v4567x3 = wasm_v64x2_shuffle(v45x2_45x3, v67x2_67x3, 1, 3); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); wasm_v128_store(packed_weights + 16, v0123x2); wasm_v128_store(packed_weights + 20, v4567x2); wasm_v128_store(packed_weights + 24, v0123x3); wasm_v128_store(packed_weights + 28, v4567x3); packed_weights += 32; } if XNN_UNLIKELY(k != 0) { // KC remainder (1..3) assert(k >= 1); assert(k <= 3); switch (k) { case 1: { v128_t v0123x0 = wasm_v128_load32_zero(w0); w0 += 1; v128_t v4567x0 = wasm_v128_load32_zero(w4); w4 += 1; v0123x0 = wasm_v128_load32_lane(w1, v0123x0, 1); w1 += 1; v4567x0 = wasm_v128_load32_lane(w5, v4567x0, 1); w5 += 1; v0123x0 = wasm_v128_load32_lane(w2, v0123x0, 2); w2 += 1; v4567x0 = wasm_v128_load32_lane(w6, v4567x0, 2); w6 += 1; v0123x0 = wasm_v128_load32_lane(w3, v0123x0, 3); w3 += 1; v4567x0 = wasm_v128_load32_lane(w7, v4567x0, 3); w7 += 1; wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); packed_weights += 8; break; } case 2: { const v128_t v0x01 = wasm_v128_load64_zero(w0); w0 += 2; const v128_t v1x01 = wasm_v128_load64_zero(w1); w1 += 2; const v128_t v2x01 = wasm_v128_load64_zero(w2); w2 += 2; const v128_t v3x01 = wasm_v128_load64_zero(w3); w3 += 2; const v128_t v4x01 = wasm_v128_load64_zero(w4); w4 += 2; const v128_t v5x01 = wasm_v128_load64_zero(w5); w5 += 2; const v128_t v6x01 = wasm_v128_load64_zero(w6); w6 += 2; const v128_t v7x01 = wasm_v128_load64_zero(w7); w7 += 2; const v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x01, v1x01, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x01, v3x01, 0, 4, 1, 5); const v128_t v45x0_45x1 = wasm_v32x4_shuffle(v4x01, v5x01, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x01, v7x01, 0, 4, 1, 5); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v4567x0 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); packed_weights += 16; break; } case 3: { v128_t v0x012 = wasm_v128_load64_zero(w0); w0 += 2; v128_t v1x012 = wasm_v128_load64_zero(w1); w1 += 2; v128_t v2x012 = wasm_v128_load64_zero(w2); w2 += 2; v128_t v3x012 = wasm_v128_load64_zero(w3); w3 += 2; v128_t v4x012 = wasm_v128_load64_zero(w4); w4 += 2; v128_t v5x012 = wasm_v128_load64_zero(w5); w5 += 2; v128_t v6x012 = wasm_v128_load64_zero(w6); w6 += 2; v128_t v7x012 = wasm_v128_load64_zero(w7); w7 += 2; v0x012 = wasm_v128_load32_lane(w0, v0x012, 2); w0 += 1; v1x012 = wasm_v128_load32_lane(w1, v1x012, 2); w1 += 1; v2x012 = wasm_v128_load32_lane(w2, v2x012, 2); w2 += 1; v3x012 = wasm_v128_load32_lane(w3, v3x012, 2); w3 += 1; v4x012 = wasm_v128_load32_lane(w4, v4x012, 2); w4 += 1; v5x012 = wasm_v128_load32_lane(w5, v5x012, 2); w5 += 1; v6x012 = wasm_v128_load32_lane(w6, v6x012, 2); w6 += 1; v7x012 = wasm_v128_load32_lane(w7, v7x012, 2); w7 += 1; const 
v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x012, v1x012, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x012, v3x012, 0, 4, 1, 5); const v128_t v01x2 = wasm_v32x4_shuffle(v0x012, v1x012, 2, 6, 3, 7); const v128_t v23x2 = wasm_v32x4_shuffle(v2x012, v3x012, 2, 6, 3, 7); const v128_t v45x0_45x1 = wasm_v32x4_shuffle(v4x012, v5x012, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x012, v7x012, 0, 4, 1, 5); const v128_t v45x2 = wasm_v32x4_shuffle(v4x012, v5x012, 2, 6, 3, 7); const v128_t v67x2 = wasm_v32x4_shuffle(v6x012, v7x012, 2, 6, 3, 7); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v0123x2 = wasm_v64x2_shuffle(v01x2, v23x2, 0, 2); const v128_t v4567x0 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); const v128_t v4567x2 = wasm_v64x2_shuffle(v45x2, v67x2, 0, 2); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); wasm_v128_store(packed_weights + 16, v0123x2); wasm_v128_store(packed_weights + 20, v4567x2); packed_weights += 24; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { assert(n >= 1); assert(n <= 7); if XNN_LIKELY(bias != NULL) { size_t nb = n; do { *packed_weights++ = *bias++; } while (--nb != 0); packed_weights += (8 - n); } else { const v128_t vzero = wasm_i32x4_const_splat(0); wasm_v128_store(packed_weights, vzero); wasm_v128_store(packed_weights + 4, vzero); packed_weights += 8; } const uint32_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const uint32_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const uint32_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const uint32_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const uint32_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const uint32_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } size_t k = kc; for (; k >= 4; k -= 4) { const v128_t v0x0123 = wasm_v128_load(w0); w0 += 4; const v128_t v1x0123 = wasm_v128_load(w1); w1 += 4; const v128_t v2x0123 = wasm_v128_load(w2); w2 += 4; const v128_t v3x0123 = wasm_v128_load(w3); w3 += 4; const v128_t v4x0123 = wasm_v128_load(w4); w4 += 4; const v128_t v5x0123 = wasm_v128_load(w5); w5 += 4; const v128_t v6x0123 = wasm_v128_load(w6); w6 += 4; const v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x0123, v1x0123, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x0123, v3x0123, 0, 4, 1, 5); const v128_t v01x2_01x3 = wasm_v32x4_shuffle(v0x0123, v1x0123, 2, 6, 3, 7); const v128_t v23x2_23x3 = wasm_v32x4_shuffle(v2x0123, v3x0123, 2, 6, 3, 7); const v128_t v45x0_45x1 = wasm_v32x4_shuffle(v4x0123, v5x0123, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x0123, v6x0123, 0, 4, 1, 5); const v128_t v45x2_45x3 = wasm_v32x4_shuffle(v4x0123, v5x0123, 2, 6, 3, 7); const v128_t v67x2_67x3 = wasm_v32x4_shuffle(v6x0123, v6x0123, 2, 6, 3, 7); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v0123x2 = wasm_v64x2_shuffle(v01x2_01x3, v23x2_23x3, 0, 2); const v128_t v0123x3 = wasm_v64x2_shuffle(v01x2_01x3, v23x2_23x3, 1, 3); const v128_t v4567x0 = 
wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); const v128_t v4567x2 = wasm_v64x2_shuffle(v45x2_45x3, v67x2_67x3, 0, 2); const v128_t v4567x3 = wasm_v64x2_shuffle(v45x2_45x3, v67x2_67x3, 1, 3); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); wasm_v128_store(packed_weights + 16, v0123x2); wasm_v128_store(packed_weights + 20, v4567x2); wasm_v128_store(packed_weights + 24, v0123x3); wasm_v128_store(packed_weights + 28, v4567x3); packed_weights += 32; } // KC remainder (1..3) if XNN_UNLIKELY(k != 0) { assert(k >= 1); assert(k <= 3); switch (k) { case 1: { v128_t v0123x0 = wasm_v128_load32_zero(w0); w0 += 1; v128_t v4567x0 = wasm_v128_load32_zero(w4); w4 += 1; v0123x0 = wasm_v128_load32_lane(w1, v0123x0, 1); w1 += 1; v4567x0 = wasm_v128_load32_lane(w5, v4567x0, 1); w5 += 1; v0123x0 = wasm_v128_load32_lane(w2, v0123x0, 2); w2 += 1; v4567x0 = wasm_v128_load32_lane(w6, v4567x0, 2); w6 += 1; v0123x0 = wasm_v128_load32_lane(w3, v0123x0, 3); w3 += 1; wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); packed_weights += 8; break; } case 2: { const v128_t v0x01 = wasm_v128_load64_zero(w0); w0 += 2; const v128_t v1x01 = wasm_v128_load64_zero(w1); w1 += 2; const v128_t v2x01 = wasm_v128_load64_zero(w2); w2 += 2; const v128_t v3x01 = wasm_v128_load64_zero(w3); w3 += 2; const v128_t v4x01 = wasm_v128_load64_zero(w4); w4 += 2; const v128_t v5x01 = wasm_v128_load64_zero(w5); w5 += 2; const v128_t v6x01 = wasm_v128_load64_zero(w6); w6 += 2; const v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x01, v1x01, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x01, v3x01, 0, 4, 1, 5); const v128_t v45x0_45x1 = wasm_v32x4_shuffle(v4x01, v5x01, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x01, v6x01, 0, 4, 1, 5); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v4567x0 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); packed_weights += 16; break; } case 3: { v128_t v0x012 = wasm_v128_load64_zero(w0); w0 += 2; v128_t v1x012 = wasm_v128_load64_zero(w1); w1 += 2; v128_t v2x012 = wasm_v128_load64_zero(w2); w2 += 2; v128_t v3x012 = wasm_v128_load64_zero(w3); w3 += 2; v128_t v4x012 = wasm_v128_load64_zero(w4); w4 += 2; v128_t v5x012 = wasm_v128_load64_zero(w5); w5 += 2; v128_t v6x012 = wasm_v128_load64_zero(w6); w6 += 2; v0x012 = wasm_v128_load32_lane(w0, v0x012, 2); w0 += 1; v1x012 = wasm_v128_load32_lane(w1, v1x012, 2); w1 += 1; v2x012 = wasm_v128_load32_lane(w2, v2x012, 2); w2 += 1; v3x012 = wasm_v128_load32_lane(w3, v3x012, 2); w3 += 1; v4x012 = wasm_v128_load32_lane(w4, v4x012, 2); w4 += 1; v5x012 = wasm_v128_load32_lane(w5, v5x012, 2); w5 += 1; v6x012 = wasm_v128_load32_lane(w6, v6x012, 2); w6 += 1; const v128_t v01x0_01x1 = wasm_v32x4_shuffle(v0x012, v1x012, 0, 4, 1, 5); const v128_t v23x0_23x1 = wasm_v32x4_shuffle(v2x012, v3x012, 0, 4, 1, 5); const v128_t v01x2 = wasm_v32x4_shuffle(v0x012, v1x012, 2, 6, 3, 7); const v128_t v23x2 = wasm_v32x4_shuffle(v2x012, v3x012, 2, 6, 3, 7); const v128_t 
v45x0_45x1 = wasm_v32x4_shuffle(v4x012, v5x012, 0, 4, 1, 5); const v128_t v67x0_67x1 = wasm_v32x4_shuffle(v6x012, v6x012, 0, 4, 1, 5); const v128_t v45x2 = wasm_v32x4_shuffle(v4x012, v5x012, 2, 6, 3, 7); const v128_t v67x2 = wasm_v32x4_shuffle(v6x012, v6x012, 2, 6, 3, 7); const v128_t v0123x0 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 0, 2); const v128_t v0123x1 = wasm_v64x2_shuffle(v01x0_01x1, v23x0_23x1, 1, 3); const v128_t v0123x2 = wasm_v64x2_shuffle(v01x2, v23x2, 0, 2); const v128_t v4567x0 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 0, 2); const v128_t v4567x1 = wasm_v64x2_shuffle(v45x0_45x1, v67x0_67x1, 1, 3); const v128_t v4567x2 = wasm_v64x2_shuffle(v45x2, v67x2, 0, 2); wasm_v128_store(packed_weights, v0123x0); wasm_v128_store(packed_weights + 4, v4567x0); wasm_v128_store(packed_weights + 8, v0123x1); wasm_v128_store(packed_weights + 12, v4567x1); wasm_v128_store(packed_weights + 16, v0123x2); wasm_v128_store(packed_weights + 20, v4567x2); packed_weights += 24; break; } default: XNN_UNREACHABLE; } } packed_weights = (uint32_t*) ((uintptr_t) packed_weights + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd( const uint32_t* input, uint32_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x32_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= block_height * sizeof(uint32_t)); assert(input_stride >= block_width * sizeof(uint32_t)); const size_t tile_height = 4; const size_t tile_width = 4; const size_t tile_hbytes = tile_height * sizeof(uint32_t); const size_t tile_wbytes = tile_width * sizeof(uint32_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t) - tile_hbytes; const uint32_t* i0 = input; uint32_t* o = (uint32_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 3); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 4; bh -= 4) { const v128_t v2_0 = wasm_v128_load(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const v128_t v2_1 = wasm_v128_load(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const v128_t v2_2 = wasm_v128_load(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const v128_t v2_3 = wasm_v128_load(i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_stride); const v128_t v1_0 = wasm_v32x4_shuffle(v2_0, v2_2, 0, 4, 1, 5); const v128_t v1_1 = wasm_v32x4_shuffle(v2_0, v2_2, 2, 6, 3, 7); const v128_t v1_2 = wasm_v32x4_shuffle(v2_1, v2_3, 0, 4, 1, 5); const v128_t v1_3 = wasm_v32x4_shuffle(v2_1, v2_3, 2, 6, 3, 7); const v128_t v0_0 = wasm_v32x4_shuffle(v1_0, v1_2, 0, 4, 1, 5); const v128_t v0_1 = wasm_v32x4_shuffle(v1_0, v1_2, 2, 6, 3, 7); const v128_t v0_2 = wasm_v32x4_shuffle(v1_1, v1_3, 0, 4, 1, 5); const v128_t v0_3 = wasm_v32x4_shuffle(v1_1, v1_3, 2, 6, 3, 7); o = (uint32_t*) ((uintptr_t) o + oN_offset); wasm_v128_store(o, v0_3); uint32_t *oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store(o, v0_2); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store(o, v0_1); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if 
XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store(o, v0_0); } o = (uint32_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const v128_t v2_0 = wasm_v128_load(i0); const uint32_t *i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const v128_t v2_1 = wasm_v128_load(i1); const uint32_t *i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const v128_t v2_2 = wasm_v128_load(i2); const v128_t v2_3 = wasm_v128_xor(v2_0, v2_0); const v128_t v1_0 = wasm_v32x4_shuffle(v2_0, v2_2, 0, 4, 1, 5); const v128_t v1_1 = wasm_v32x4_shuffle(v2_0, v2_2, 2, 6, 3, 7); const v128_t v1_2 = wasm_v32x4_shuffle(v2_1, v2_3, 0, 4, 1, 5); const v128_t v1_3 = wasm_v32x4_shuffle(v2_1, v2_3, 2, 6, 3, 7); v128_t v0_0 = wasm_v32x4_shuffle(v1_0, v1_2, 0, 4, 1, 5); v128_t v0_1 = wasm_v32x4_shuffle(v1_0, v1_2, 2, 6, 3, 7); v128_t v0_2 = wasm_v32x4_shuffle(v1_1, v1_3, 0, 4, 1, 5); v128_t v0_3 = wasm_v32x4_shuffle(v1_1, v1_3, 2, 6, 3, 7); if (bh & 2) { o = (uint32_t*) ((uintptr_t) o + oN_stride); wasm_v128_store64_lane(o, v0_3, 0); uint32_t *oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store64_lane(o, v0_2, 0); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store64_lane(o, v0_1, 0); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store64_lane(o, v0_0, 0); o += 2; v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1); v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1); v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1); v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1); } if (bh & 1) { o = (uint32_t*) ((uintptr_t) o + oN_stride); wasm_v128_store32_lane(o, v0_3, 0); uint32_t *oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store32_lane(o, v0_2, 0); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store32_lane(o, v0_1, 0); oN = (uint32_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store32_lane(o, v0_0, 0); } } i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); o = (uint32_t*) ((uintptr_t) o + output_reset); block_width = doz(block_width, tile_width); } while (block_width != 0); } void xnn_x32_unpool_ukernel__wasmsimd( size_t kernel_elements, size_t channels, uint32_t fill, const uint32_t* input, const uint32_t* index, uint32_t** output) { // Pre-initialize outputs with constant. const v128_t vfill = wasm_i32x4_splat(fill); uint32_t** os = output; do { float* o = (float*) *os++; size_t c = channels; for (; c >= 4; c -= 4) { wasm_v128_store(o, vfill); o += 4; } if (c != 0) { if (c & 2) { wasm_v128_store64_lane(o, vfill, 0); o += 2; } if (c & 1) { wasm_v128_store32_lane(o, vfill, 0); } } } while (--kernel_elements != 0); // Copy indexed elements to output. 
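// The loop below scatters one value per channel: index[c] selects which of
// the kernel_elements output rows receives input[c], while the running byte
// offset keeps each value in its own channel slot. Illustrative trace with
// hypothetical values channels = 2, input = {5, 7}, index = {1, 0}:
//   iteration 0: output[1][0] = 5;   iteration 1: output[0][1] = 7.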
size_t offset = 0; do { const uint32_t i = *index++; *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++; offset += sizeof(uint32_t); } while (--channels != 0); } void xnn_x32_zip_x2_ukernel__wasmsimd( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % sizeof(uint32_t) == 0); const float* x = (const float*) input; const float* y = (const float*) ((uintptr_t) x + n); float* o = (float*) output; while (n >= 4 * sizeof(uint32_t)) { const v128_t vx = wasm_v128_load(x); x += 4; const v128_t vy = wasm_v128_load(y); y += 4; const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); wasm_v128_store(o, vxy_lo); wasm_v128_store(o + 4, vxy_hi); o += 8; n -= 4 * sizeof(uint32_t); } if XNN_UNLIKELY(n != 0) { if (n & (2 * sizeof(uint32_t))) { const double vx = *((const double*) x); x += 2; const double vy = *((const double*) y); y += 2; const v128_t vxy = wasm_f64x2_make(vx, vy); wasm_v128_store(o, wasm_v32x4_shuffle(vxy, vxy, 0, 2, 1, 3)); o += 4; } if (n & (1 * sizeof(uint32_t))) { const float vx = *x; const float vy = *y; o[0] = vx; o[1] = vy; } } } void xnn_x32_zip_x3_ukernel__wasmsimd( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % sizeof(uint32_t) == 0); const float* x = (const float*) input; const float* y = (const float*) ((uintptr_t) x + n); const float* z = (const float*) ((uintptr_t) y + n); float* o = (float*) output; while (n >= 4 * sizeof(uint32_t)) { // vx = ( x3, x2, x1, x0 ) const v128_t vx = wasm_v128_load(x); x += 4; // vy = ( y3, y2, y1, y0 ) const v128_t vy = wasm_v128_load(y); y += 4; // vz = ( z3, z2, z1, z0 ) const v128_t vz = wasm_v128_load(z); z += 4; // vxy = ( y2, y0, x2, x0 ) const v128_t vxy = wasm_v32x4_shuffle(vx, vy, 0, 2, 4, 6); // vyz = ( z3, z1, y3, y1 ) const v128_t vyz = wasm_v32x4_shuffle(vy, vz, 1, 3, 5, 7); // vzx = ( x3, x1, z2, z0 ) const v128_t vzx = wasm_v32x4_shuffle(vz, vx, 0, 2, 5, 7); // vxyz0 = ( x1, z0, y0, x0 ) const v128_t vxyz0 = wasm_v32x4_shuffle(vxy, vzx, 0, 2, 4, 6); // vxyz1 = ( y2, x2, z1, y1 ) const v128_t vxyz1 = wasm_v32x4_shuffle(vyz, vxy, 0, 2, 5, 7); // vxyz2 = ( z3, y3, x3, z2 ) const v128_t vxyz2 = wasm_v32x4_shuffle(vzx, vyz, 1, 3, 5, 7); wasm_v128_store(o, vxyz0); wasm_v128_store(o + 4, vxyz1); wasm_v128_store(o + 8, vxyz2); o += 12; n -= 4 * sizeof(uint32_t); } if XNN_UNLIKELY(n != 0) { do { const float vx = *x++; const float vy = *y++; const float vz = *z++; o[0] = vx; o[1] = vy; o[2] = vz; o += 3; n -= sizeof(uint32_t); } while (n != 0); } } void xnn_x32_zip_x4_ukernel__wasmsimd( size_t n, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % sizeof(uint32_t) == 0); const float* x = (const float*) input; const float* y = (const float*) ((uintptr_t) x + n); const float* z = (const float*) ((uintptr_t) y + n); const float* w = (const float*) ((uintptr_t) z + n); float* o = (float*) output; while (n >= 4 * sizeof(uint32_t)) { const v128_t vx = wasm_v128_load(x); x += 4; const v128_t vy = wasm_v128_load(y); y += 4; const v128_t vz = wasm_v128_load(z); z += 4; const v128_t vw = wasm_v128_load(w); w += 4; const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5); const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7); const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5); const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7); 
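// The 4-way zip above is a two-stage interleave: the 32-bit shuffles pair
// lanes from (x, y) and from (z, w), and the vxyzw0..vxyzw3 shuffles splice
// 64-bit halves so that each output vector holds one complete group. Lane
// trace, in the same ( high, ..., low ) convention as the zip_x3 comments:
//   vxy_lo = ( y1, x1, y0, x0 )   vzw_lo = ( w1, z1, w0, z0 )
//   vxyzw0 = ( w0, z0, y0, x0 )   vxyzw1 = ( w1, z1, y1, x1 )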
const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5); const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7); wasm_v128_store(o, vxyzw0); wasm_v128_store(o + 4, vxyzw1); wasm_v128_store(o + 8, vxyzw2); wasm_v128_store(o + 12, vxyzw3); o += 16; n -= 4 * sizeof(uint32_t); } if XNN_UNLIKELY(n != 0) { if (n & (2 * sizeof(uint32_t))) { const double vx = *((const double*) x); x += 2; const double vy = *((const double*) y); y += 2; const double vz = *((const double*) z); z += 2; const double vw = *((const double*) w); w += 2; const v128_t vxy = wasm_f64x2_make(vx, vy); const v128_t vzw = wasm_f64x2_make(vz, vw); const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6); const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7); wasm_v128_store(o, vxyzw_lo); wasm_v128_store(o + 4, vxyzw_hi); o += 8; } if (n & (1 * sizeof(uint32_t))) { const float vx = *x; const float vy = *y; const float vz = *z; const float vw = *w; o[0] = vx; o[1] = vy; o[2] = vz; o[3] = vw; } } } void xnn_x32_zip_xm_ukernel__wasmsimd( size_t n, size_t m, const uint32_t* input, uint32_t* output) { assert(n != 0); assert(n % sizeof(uint32_t) == 0); assert(m >= 4); const float* w = (const float*) input; float* o = (float*) output; const size_t group_increment = m * 4; const size_t input_increment = n * 3; const size_t output_increment = 4 * sizeof(uint32_t) - m * n; const float* last_input = (const float*) ((uintptr_t) input + n * (m - 1)); float* last_output = (float*) ((uintptr_t) output + (m * 4 - 4 * sizeof(uint32_t))); for (size_t i = 0; i < m; i += 4) { w = (const float*) ((uintptr_t) w + input_increment); if (w >= last_input) { w = last_input; } const float* z = (const float*) ((uintptr_t) w - n); const float* y = (const float*) ((uintptr_t) z - n); const float* x = (const float*) ((uintptr_t) y - n); size_t k = n; while (k >= 4 * sizeof(uint32_t)) { const v128_t vx = wasm_v128_load((const v128_t*) x); x += 4; const v128_t vy = wasm_v128_load((const v128_t*) y); y += 4; const v128_t vz = wasm_v128_load((const v128_t*) z); z += 4; const v128_t vw = wasm_v128_load((const v128_t*) w); w += 4; const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5); const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7); const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5); const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7); const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5); const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7); wasm_v128_store(o, vxyzw0); o = (float*) ((uintptr_t) o + group_increment); wasm_v128_store(o, vxyzw1); o = (float*) ((uintptr_t) o + group_increment); wasm_v128_store(o, vxyzw2); o = (float*) ((uintptr_t) o + group_increment); wasm_v128_store(o, vxyzw3); o = (float*) ((uintptr_t) o + group_increment); k -= 4 * sizeof(uint32_t); } if XNN_UNLIKELY(k != 0) { if (k & (2 * sizeof(uint32_t))) { const double vx = *((const double*) x); x += 2; const double vy = *((const double*) y); y += 2; const double vz = *((const double*) z); z += 2; const double vw = *((const double*) w); w += 2; const v128_t vxy = wasm_f64x2_make(vx, vy); const v128_t vzw = wasm_f64x2_make(vz, vw); const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6); const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7); wasm_v128_store(o, vxyzw_lo); o = (float*) ((uintptr_t) o + group_increment); 
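// Each store in this kernel advances o by group_increment = m * 4 bytes,
// i.e. one full interleaved group of m 32-bit channels, so consecutive
// vectors land in consecutive groups. Illustrative case with a hypothetical
// m = 4: group_increment is 16 bytes, vxyzw_lo above fills group g with
// (x0, y0, z0, w0), and vxyzw_hi below fills group g + 1 with
// (x1, y1, z1, w1).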
wasm_v128_store(o, vxyzw_hi); o = (float*) ((uintptr_t) o + group_increment); } if (k & (1 * sizeof(uint32_t))) { const float vx = *x; const float vy = *y; const float vz = *z; const float vw = *w++; o[0] = vx; o[1] = vy; o[2] = vz; o[3] = vw; o = (float*) ((uintptr_t) o + group_increment); } } o = (float*) ((uintptr_t) o + output_increment); if (o > last_output) { o = last_output; } } } void xnn_x8_lut_ukernel__wasmsimd_x32( size_t batch, const uint8_t* input, uint8_t* output, const uint8_t table[restrict XNN_MIN_ELEMENTS(256)]) { assert(batch != 0); assert(batch % sizeof(uint8_t) == 0); assert(input != NULL); assert(output != NULL); const v128_t vtable0 = wasm_v128_load(table); const v128_t vtable1 = wasm_v128_load(table + 16); const v128_t vtable2 = wasm_v128_load(table + 32); const v128_t vtable3 = wasm_v128_load(table + 48); const v128_t vtable4 = wasm_v128_load(table + 64); const v128_t vtable5 = wasm_v128_load(table + 80); const v128_t vtable6 = wasm_v128_load(table + 96); const v128_t vtable7 = wasm_v128_load(table + 112); const v128_t vtable8 = wasm_v128_load(table + 128); const v128_t vtable9 = wasm_v128_load(table + 144); const v128_t vtable10 = wasm_v128_load(table + 160); const v128_t vtable11 = wasm_v128_load(table + 176); const v128_t vtable12 = wasm_v128_load(table + 192); const v128_t vtable13 = wasm_v128_load(table + 208); const v128_t vtable14 = wasm_v128_load(table + 224); const v128_t vtable15 = wasm_v128_load(table + 240); const v128_t voffset = wasm_i8x16_const_splat(16); for (; batch >= 32 * sizeof(uint8_t); batch -= 32 * sizeof(uint8_t)) { v128_t vx0 = wasm_v128_load(input); v128_t vx1 = wasm_v128_load(input + 16); input += 32; v128_t vy0 = wasm_i8x16_swizzle(vtable0, vx0); v128_t vy1 = wasm_i8x16_swizzle(vtable0, vx1); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable1, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable1, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable2, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable2, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable3, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable3, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable4, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable4, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable5, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable5, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable6, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable6, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable7, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable7, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable8, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable8, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable9, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable9, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, 
wasm_i8x16_swizzle(vtable10, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable10, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable11, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable11, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable12, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable12, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable13, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable13, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable14, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable14, vx1)); vx0 = wasm_i8x16_sub(vx0, voffset); vy0 = wasm_v128_or(vy0, wasm_i8x16_swizzle(vtable15, vx0)); vx1 = wasm_i8x16_sub(vx1, voffset); vy1 = wasm_v128_or(vy1, wasm_i8x16_swizzle(vtable15, vx1)); wasm_v128_store(output, vy0); wasm_v128_store(output + 16, vy1); output += 32; } for (; batch >= 16 * sizeof(uint8_t); batch -= 16 * sizeof(uint8_t)) { v128_t vx = wasm_v128_load(input); input += 16; v128_t vy = wasm_i8x16_swizzle(vtable0, vx); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable8, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx)); wasm_v128_store(output, vy); output += 16; } if XNN_UNLIKELY(batch != 0) { v128_t vx = wasm_v128_load(input); v128_t vy = wasm_i8x16_swizzle(vtable0, vx); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable1, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable2, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable3, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable4, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable5, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable6, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable7, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, 
wasm_i8x16_swizzle(vtable8, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable9, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable10, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable11, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable12, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable13, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable14, vx)); vx = wasm_i8x16_sub(vx, voffset); vy = wasm_v128_or(vy, wasm_i8x16_swizzle(vtable15, vx)); if (batch & (8 * sizeof(uint8_t))) { wasm_v128_store64_lane(output, vy, 0); vy = wasm_v64x2_shuffle(vy, vy, 1, 1); output += 8; } if (batch & (4 * sizeof(uint8_t))) { wasm_v128_store32_lane(output, vy, 0); vy = wasm_u64x2_shr(vy, 32); output += 4; } if (batch & (2 * sizeof(uint8_t))) { wasm_v128_store16_lane(output, vy, 0); vy = wasm_u32x4_shr(vy, 16); output += 2; } if (batch & (1 * sizeof(uint8_t))) { wasm_v128_store8_lane(output, vy, 0); } } } void xnn_x8_packw_gemm_goi_ukernel_x8__scalar_int_x4( size_t g, size_t nc, size_t kc, size_t nr, size_t kr, size_t sr, const int8_t* weights, const uint32_t* bias, int8_t* packed_weights, size_t extra_bytes, const void* params) { assert(g != 0); assert(nc != 0); assert(kc != 0); assert(nr == 8); // This kernel is for NR=8 assert(kr == 1); assert(sr == 1); assert(weights != NULL); assert(packed_weights != NULL); int8_t* out = (int8_t*) packed_weights; const uint32_t* b = (const uint32_t*) bias; do { // NC main loop multiple of 8 const int8_t* w0 = (const int8_t*) weights; size_t n = nc; for (;n >= 8; n -= 8) { if XNN_LIKELY(b != NULL) { ((uint32_t*) out)[0] = b[0]; ((uint32_t*) out)[1] = b[1]; ((uint32_t*) out)[2] = b[2]; ((uint32_t*) out)[3] = b[3]; ((uint32_t*) out)[4] = b[4]; ((uint32_t*) out)[5] = b[5]; ((uint32_t*) out)[6] = b[6]; ((uint32_t*) out)[7] = b[7]; b += 8; } else { ((uint32_t*) out)[0] = 0; ((uint32_t*) out)[1] = 0; ((uint32_t*) out)[2] = 0; ((uint32_t*) out)[3] = 0; ((uint32_t*) out)[4] = 0; ((uint32_t*) out)[5] = 0; ((uint32_t*) out)[6] = 0; ((uint32_t*) out)[7] = 0; } out += 8 * sizeof(uint32_t); const int8_t* w1 = w0 + kc; const int8_t* w2 = w1 + kc; const int8_t* w3 = w2 + kc; const int8_t* w4 = w3 + kc; const int8_t* w5 = w4 + kc; const int8_t* w6 = w5 + kc; const int8_t* w7 = w6 + kc; // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; const int8_t v70 = w7[0]; const int8_t v71 = w7[1]; const int8_t v72 = w7[2]; const int8_t v73 = w7[3]; w7 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[7] = v70; out[8] = v01; 
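// (The remaining stores of this 8x4 block continue below.) The scalar
// packing writes a transposed tile: out[k * 8 + r] holds row r at depth k,
// so out[8] above is row 0, depth 1 (v01) and out[9] below is row 1,
// depth 1 (v11), giving the consuming GEMM microkernel 8 consecutive
// per-row values for each depth step.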
out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[15] = v71; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[23] = v72; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out[31] = v73; out += 32; } // KC remainder for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] = v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; const int8_t v7 = *w7++; out[7] = v7; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); w0 = w7; } // NC remainder (1..7) if XNN_UNLIKELY(n != 0) { if XNN_LIKELY(b != NULL) { size_t nb = n; do { *((uint32_t*) out) = *b++; out += sizeof(uint32_t); } while (--nb != 0); } else { size_t nb = n; do { *((uint32_t*) out) = 0; out += sizeof(uint32_t); } while (--nb != 0); } out += (8 - n) * sizeof(uint32_t); // NR remainder has less than 8 rows so last row is not loaded const int8_t* w1 = w0 + kc; if XNN_UNPREDICTABLE(n < 2) { w1 = w0; } const int8_t* w2 = w1 + kc; if XNN_UNPREDICTABLE(n <= 2) { w2 = w1; } const int8_t* w3 = w2 + kc; if XNN_UNPREDICTABLE(n < 4) { w3 = w2; } const int8_t* w4 = w3 + kc; if XNN_UNPREDICTABLE(n <= 4) { w4 = w3; } const int8_t* w5 = w4 + kc; if XNN_UNPREDICTABLE(n < 6) { w5 = w4; } const int8_t* w6 = w5 + kc; if XNN_UNPREDICTABLE(n <= 6) { w6 = w5; } // KC main loop multiple of 8x4 size_t k = kc; for (; k >= 4; k -= 4) { const int8_t v00 = w0[0]; const int8_t v01 = w0[1]; const int8_t v02 = w0[2]; const int8_t v03 = w0[3]; w0 += 4; const int8_t v10 = w1[0]; const int8_t v11 = w1[1]; const int8_t v12 = w1[2]; const int8_t v13 = w1[3]; w1 += 4; const int8_t v20 = w2[0]; const int8_t v21 = w2[1]; const int8_t v22 = w2[2]; const int8_t v23 = w2[3]; w2 += 4; const int8_t v30 = w3[0]; const int8_t v31 = w3[1]; const int8_t v32 = w3[2]; const int8_t v33 = w3[3]; w3 += 4; const int8_t v40 = w4[0]; const int8_t v41 = w4[1]; const int8_t v42 = w4[2]; const int8_t v43 = w4[3]; w4 += 4; const int8_t v50 = w5[0]; const int8_t v51 = w5[1]; const int8_t v52 = w5[2]; const int8_t v53 = w5[3]; w5 += 4; const int8_t v60 = w6[0]; const int8_t v61 = w6[1]; const int8_t v62 = w6[2]; const int8_t v63 = w6[3]; w6 += 4; out[0] = v00; out[1] = v10; out[2] = v20; out[3] = v30; out[4] = v40; out[5] = v50; out[6] = v60; out[8] = v01; out[9] = v11; out[10] = v21; out[11] = v31; out[12] = v41; out[13] = v51; out[14] = v61; out[16] = v02; out[17] = v12; out[18] = v22; out[19] = v32; out[20] = v42; out[21] = v52; out[22] = v62; out[24] = v03; out[25] = v13; out[26] = v23; out[27] = v33; out[28] = v43; out[29] = v53; out[30] = v63; out += 32; } // KC remainder of 1..3 for (; k != 0; --k) { const int8_t v0 = *w0++; out[0] = v0; const int8_t v1 = *w1++; out[1] = v1; const int8_t v2 = *w2++; out[2] = v2; const int8_t v3 = *w3++; out[3] = v3; const int8_t v4 = *w4++; out[4] = v4; const int8_t v5 = *w5++; out[5] = v5; const int8_t v6 = *w6++; out[6] = v6; out += 8; } out = (int8_t*) ((uintptr_t) out + extra_bytes); } weights += nc * kc; } while (--g != 0); } void xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd( const uint8_t* input, uint8_t* output, size_t input_stride, size_t output_stride, size_t block_width, size_t block_height, const union xnn_x8_transpose_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(output_stride >= 
block_height * sizeof(uint8_t)); assert(input_stride >= block_width * sizeof(uint8_t)); const size_t tile_height = 16; const size_t tile_width = 16; const size_t tile_hbytes = tile_height * sizeof(uint8_t); const size_t tile_wbytes = tile_width * sizeof(uint8_t); const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride; const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes; const uint8_t* i0 = input; uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes); const size_t minus_output_stride = -output_stride; do { const size_t rem = min(block_width - 1, 15); const size_t oN_stride = rem * output_stride; const size_t oN_offset = oN_stride + tile_hbytes; size_t bh = block_height; for (; bh >= 16; bh -= 16) { const v128_t v4_0 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_1 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_2 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_3 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_4 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_5 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_6 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_7 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_8 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_9 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_10 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_11 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_12 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_13 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_14 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v4_15 = wasm_v128_load(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride); const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 
11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_11 = 
wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); o = (uint8_t*) ((uintptr_t) o + oN_offset); wasm_v128_store(o, v0_15); uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 15) { o = oN; } wasm_v128_store(o, v0_14); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 15) { o = oN; } wasm_v128_store(o, v0_13); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 13) { o = oN; } wasm_v128_store(o, v0_12); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 13) { o = oN; } wasm_v128_store(o, v0_11); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 11) { o = oN; } wasm_v128_store(o, v0_10); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 11) { o = oN; } wasm_v128_store(o, v0_9); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 9) { o = oN; } wasm_v128_store(o, v0_8); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 9) { o = oN; } wasm_v128_store(o, v0_7); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if 
XNN_UNPREDICTABLE(block_width > 7) { o = oN; } wasm_v128_store(o, v0_6); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 7) { o = oN; } wasm_v128_store(o, v0_5); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 5) { o = oN; } wasm_v128_store(o, v0_4); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 5) { o = oN; } wasm_v128_store(o, v0_3); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 3) { o = oN; } wasm_v128_store(o, v0_2); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width >= 3) { o = oN; } wasm_v128_store(o, v0_1); oN = (uint8_t*) ((uintptr_t) o + minus_output_stride); if XNN_UNPREDICTABLE(block_width > 1) { o = oN; } wasm_v128_store(o, v0_0); } o = (uint8_t*) ((uintptr_t) o + tile_hbytes); if (bh != 0) { const v128_t v4_0 = wasm_v128_load(i0); const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); if XNN_UNPREDICTABLE(bh < 2) { i1 = i0; } const v128_t v4_1 = wasm_v128_load(i1); const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); if XNN_UNPREDICTABLE(bh <= 2) { i2 = i1; } const v128_t v4_2 = wasm_v128_load(i2); const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); if XNN_UNPREDICTABLE(bh < 4) { i3 = i2; } const v128_t v4_3 = wasm_v128_load(i3); const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); if XNN_UNPREDICTABLE(bh <= 4) { i4 = i3; } const v128_t v4_4 = wasm_v128_load(i4); const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); if XNN_UNPREDICTABLE(bh < 6) { i5 = i4; } const v128_t v4_5 = wasm_v128_load(i5); const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); if XNN_UNPREDICTABLE(bh <= 6) { i6 = i5; } const v128_t v4_6 = wasm_v128_load(i6); const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride); if XNN_UNPREDICTABLE(bh < 8) { i7 = i6; } const v128_t v4_7 = wasm_v128_load(i7); const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride); if XNN_UNPREDICTABLE(bh <= 8) { i8 = i7; } const v128_t v4_8 = wasm_v128_load(i8); const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride); if XNN_UNPREDICTABLE(bh < 10) { i9 = i8; } const v128_t v4_9 = wasm_v128_load(i9); const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride); if XNN_UNPREDICTABLE(bh <= 10) { i10 = i9; } const v128_t v4_10 = wasm_v128_load(i10); const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride); if XNN_UNPREDICTABLE(bh < 12) { i11 = i10; } const v128_t v4_11 = wasm_v128_load(i11); const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride); if XNN_UNPREDICTABLE(bh <= 12) { i12 = i11; } const v128_t v4_12 = wasm_v128_load(i12); const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride); if XNN_UNPREDICTABLE(bh < 14) { i13 = i12; } const v128_t v4_13 = wasm_v128_load(i13); const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride); if XNN_UNPREDICTABLE(bh <= 14) { i14 = i13; } const v128_t v4_14 = wasm_v128_load(i14); const v128_t v4_15 = wasm_v128_xor(v4_0, v4_0); const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_3 = 
wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 
      const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

      v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
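      // Store the bh leftover rows in power-of-two chunks: each block below
      // writes the low lanes of every column, then shifts or rotates the
      // consumed bytes out so the next, narrower block continues from lane 0.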
      if (bh & 8) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        wasm_v128_store64_lane(o, v0_15, 0);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_14, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_13, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_12, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_11, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_10, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_9, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_8, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_7, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_6, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_5, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_4, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_3, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_2, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_1, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        wasm_v128_store64_lane(o, v0_0, 0);
        o += 8;
        v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
        v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
        v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
        v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
        v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
        v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
        v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
        v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
        v0_8 = wasm_v64x2_shuffle(v0_8, v0_8, 1, 1);
        v0_9 = wasm_v64x2_shuffle(v0_9, v0_9, 1, 1);
        v0_10 = wasm_v64x2_shuffle(v0_10, v0_10, 1, 1);
        v0_11 = wasm_v64x2_shuffle(v0_11, v0_11, 1, 1);
        v0_12 = wasm_v64x2_shuffle(v0_12, v0_12, 1, 1);
        v0_13 = wasm_v64x2_shuffle(v0_13, v0_13, 1, 1);
        v0_14 = wasm_v64x2_shuffle(v0_14, v0_14, 1, 1);
        v0_15 = wasm_v64x2_shuffle(v0_15, v0_15, 1, 1);
      }
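      // Same store pattern with 4-byte lanes; wasm_u64x2_shr replaces the lane
      // rotation used above since only 32 bits need to be discarded.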
      if (bh & 4) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        wasm_v128_store32_lane(o, v0_15, 0);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_14, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_13, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_12, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_11, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_10, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_9, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_8, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_7, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_6, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_5, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_4, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_3, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_2, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_1, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        wasm_v128_store32_lane(o, v0_0, 0);
        o += 4;
        v0_0 = wasm_u64x2_shr(v0_0, 32);
        v0_1 = wasm_u64x2_shr(v0_1, 32);
        v0_2 = wasm_u64x2_shr(v0_2, 32);
        v0_3 = wasm_u64x2_shr(v0_3, 32);
        v0_4 = wasm_u64x2_shr(v0_4, 32);
        v0_5 = wasm_u64x2_shr(v0_5, 32);
        v0_6 = wasm_u64x2_shr(v0_6, 32);
        v0_7 = wasm_u64x2_shr(v0_7, 32);
        v0_8 = wasm_u64x2_shr(v0_8, 32);
        v0_9 = wasm_u64x2_shr(v0_9, 32);
        v0_10 = wasm_u64x2_shr(v0_10, 32);
        v0_11 = wasm_u64x2_shr(v0_11, 32);
        v0_12 = wasm_u64x2_shr(v0_12, 32);
        v0_13 = wasm_u64x2_shr(v0_13, 32);
        v0_14 = wasm_u64x2_shr(v0_14, 32);
        v0_15 = wasm_u64x2_shr(v0_15, 32);
      }
      if (bh & 2) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        wasm_v128_store16_lane(o, v0_15, 0);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_14, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_13, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_12, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_11, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_10, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_9, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_8, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_7, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_6, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_5, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_4, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_3, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_2, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_1, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        wasm_v128_store16_lane(o, v0_0, 0);
        o += 2;
        v0_0 = wasm_u32x4_shr(v0_0, 16);
        v0_1 = wasm_u32x4_shr(v0_1, 16);
        v0_2 = wasm_u32x4_shr(v0_2, 16);
        v0_3 = wasm_u32x4_shr(v0_3, 16);
        v0_4 = wasm_u32x4_shr(v0_4, 16);
        v0_5 = wasm_u32x4_shr(v0_5, 16);
        v0_6 = wasm_u32x4_shr(v0_6, 16);
        v0_7 = wasm_u32x4_shr(v0_7, 16);
        v0_8 = wasm_u32x4_shr(v0_8, 16);
        v0_9 = wasm_u32x4_shr(v0_9, 16);
        v0_10 = wasm_u32x4_shr(v0_10, 16);
        v0_11 = wasm_u32x4_shr(v0_11, 16);
        v0_12 = wasm_u32x4_shr(v0_12, 16);
        v0_13 = wasm_u32x4_shr(v0_13, 16);
        v0_14 = wasm_u32x4_shr(v0_14, 16);
        v0_15 = wasm_u32x4_shr(v0_15, 16);
      }
      if (bh & 1) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        wasm_v128_store8_lane(o, v0_15, 0);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_14, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_13, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_12, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_11, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_10, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_9, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_8, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_7, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_6, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_5, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_4, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_3, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_2, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_1, 0);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        wasm_v128_store8_lane(o, v0_0, 0);
      }
    }

    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
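// Fills `rows` rows of `channels` bytes each with a 4-byte pattern, stepping
// `output_stride` bytes between row starts. The pattern is splatted into a
// 128-bit vector for the wide stores; tails of fewer than 4 bytes peel bytes
// off a scalar copy of the pattern.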
void xnn_xx_fill_ukernel__wasmsimd_x64(
    size_t rows,
    size_t channels,
    void* output,
    size_t output_stride,
    const uint32_t fill_pattern)
{
  assert(rows != 0);
  assert(channels != 0);

  const size_t output_increment = output_stride - channels;
  const v128_t vfill_pattern = wasm_i32x4_splat(fill_pattern);
  do {
    size_t c = channels;
    for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
      wasm_v128_store(output, vfill_pattern);
      wasm_v128_store((uint8_t*) output + 16, vfill_pattern);
      wasm_v128_store((uint8_t*) output + 32, vfill_pattern);
      wasm_v128_store((uint8_t*) output + 48, vfill_pattern);
      output = ((uint8_t*) output + 64);
    }
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
      wasm_v128_store(output, vfill_pattern);
      output = ((uint8_t*) output + 16);
    }
    if XNN_UNLIKELY(c != 0) {
      if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
        wasm_v128_store64_lane(output, vfill_pattern, 0);
        output = ((uint8_t*) output + 8);
      }
      if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
        unaligned_store_u32(output, fill_pattern);
        output = ((uint8_t*) output + 4);
      }
      uint32_t vfill_subpattern = fill_pattern;
      if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
        unaligned_store_u16(output, (uint16_t) vfill_subpattern);
        vfill_subpattern >>= 16;
        output = ((uint8_t*) output + 2);
      }
      if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
        *((uint8_t*) output) = (uint8_t) vfill_subpattern;
        output = ((uint8_t*) output + 1);
      }
    }
    output = (void*) ((uintptr_t) output + output_increment);
  } while (--rows != 0);
}
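// Writes each output row as `pre_padding` bytes of the fill pattern, a copy of
// `channels` input bytes, then `post_padding` bytes of the pattern. The copy
// tail loads a full 16-byte vector and may read past the input, hence the
// XNN_OOB_READS annotation.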
void xnn_xx_pad_ukernel__wasmsimd(
    size_t rows,
    size_t channels,
    size_t pre_padding,
    size_t post_padding,
    const void* input,
    size_t input_stride,
    void* output,
    size_t output_stride,
    const uint32_t fill_pattern) XNN_OOB_READS
{
  const size_t input_increment = input_stride - channels;
  const size_t output_increment = output_stride - (pre_padding + channels + post_padding);
  const v128_t vfill_pattern = wasm_i32x4_splat((int32_t) fill_pattern);
  do {
    // Pre-pad input channels.
    size_t l = pre_padding;
    if XNN_LIKELY(l != 0) {
      for (; l >= 16 * sizeof(uint8_t); l -= 16 * sizeof(uint8_t)) {
        wasm_v128_store(output, vfill_pattern);
        output = (uint8_t*) output + 16;
      }
      if (l & (8 * sizeof(uint8_t))) {
        wasm_v128_store64_lane(output, vfill_pattern, 0);
        output = (uint8_t*) output + 8;
      }
      if (l & (4 * sizeof(uint8_t))) {
        unaligned_store_u32(output, fill_pattern);
        output = (uint8_t*) output + 4;
      }
      uint32_t vfill_subpattern = fill_pattern;
      if (l & (2 * sizeof(uint8_t))) {
        unaligned_store_u16(output, (uint16_t) vfill_subpattern);
        vfill_subpattern >>= 16;
        output = (uint8_t*) output + 2;
      }
      if (l & (1 * sizeof(uint8_t))) {
        *((uint8_t*) output) = (uint8_t) vfill_subpattern;
        output = (uint8_t*) output + 1;
      }
    }

    // Copy input channels.
    size_t c = channels;
    for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
      const v128_t vdata = wasm_v128_load(input);
      input = (const uint8_t*) input + 16;

      wasm_v128_store(output, vdata);
      output = (uint8_t*) output + 16;
    }
    if XNN_UNLIKELY(c != 0) {
      v128_t vdata = wasm_v128_load(input);
      input = (const void*) ((uintptr_t) input + c);
      if (c & (8 * sizeof(uint8_t))) {
        wasm_v128_store64_lane(output, vdata, 0);
        vdata = wasm_v64x2_shuffle(vdata, vdata, 1, 1);
        output = (uint8_t*) output + 8;
      }
      if (c & (4 * sizeof(uint8_t))) {
        wasm_v128_store32_lane(output, vdata, 0);
        vdata = wasm_u64x2_shr(vdata, 32);
        output = (uint8_t*) output + 4;
      }
      if (c & (2 * sizeof(uint8_t))) {
        wasm_v128_store16_lane(output, vdata, 0);
        vdata = wasm_u32x4_shr(vdata, 16);
        output = (uint8_t*) output + 2;
      }
      if (c & (1 * sizeof(uint8_t))) {
        wasm_v128_store8_lane(output, vdata, 0);
        output = (uint8_t*) output + 1;
      }
    }

    // Post-pad input channels.
    size_t r = post_padding;
    if XNN_LIKELY(r != 0) {
      for (; r >= 16 * sizeof(uint8_t); r -= 16 * sizeof(uint8_t)) {
        wasm_v128_store(output, vfill_pattern);
        output = (uint8_t*) output + 16;
      }
      if (r & (8 * sizeof(uint8_t))) {
        wasm_v128_store64_lane(output, vfill_pattern, 0);
        output = (uint8_t*) output + 8;
      }
      if (r & (4 * sizeof(uint8_t))) {
        unaligned_store_u32(output, fill_pattern);
        output = (uint8_t*) output + 4;
      }
      uint32_t vfill_subpattern = fill_pattern;
      if (r & (2 * sizeof(uint8_t))) {
        unaligned_store_u16(output, (uint16_t) vfill_subpattern);
        vfill_subpattern >>= 16;
        output = (uint8_t*) output + 2;
      }
      if (r & (1 * sizeof(uint8_t))) {
        *((uint8_t*) output) = (uint8_t) vfill_subpattern;
        output = (uint8_t*) output + 1;
      }
    }
    input = (const void*) ((uintptr_t) input + input_increment);
    output = (void*) ((uintptr_t) output + output_increment);
  } while (--rows != 0);
}