|
// Copyright 2019 Google LLC |
|
// |
|
// This source code is licensed under the BSD-style license found in the |
|
// LICENSE file in the root directory of this source tree. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55
//
// For every output pixel, and for each of x0 float elements ("channels"):
//   out[c] = min(max(bias[c] + sum over t = 0..8 of in_t[c] * w_t[c], v30), v31)
// where in_0..in_8 are the 9 input pointers read from the table at x2.
//
// Register roles as used by the code below. Argument names are presumed from
// the existing inline comments and the kernel name -- TODO confirm against the
// C prototype:
//   x0        element (channel) count per pixel; x20 holds the running counter
//   x1        number of output pixels; the outer loop at 0: runs until it is 0
//   x2        table of 9 input pointers; advanced by x5 bytes per pixel
//   x3        start of the packed bias/weight stream; x19 is the read cursor
//   x4        output pointer; post-incremented by the stores, then by x6 bytes
//             after each pixel
//   x7        input_offset, added to every input pointer that is not "zero"
//   [sp]      -> x17, the "zero" pointer (compared against each input pointer)
//   [sp + 8]  -> x16, address of two consecutive floats broadcast into
//                v30 (applied with FMAX) and v31 (applied with FMIN);
//                x16 is afterwards reused as the 9th input pointer
//   x8-x16    input pointers i0-i8 for the current pixel
//   v0-v3     accumulators: v0/v1 = low/high half of one 4-element group ("A"),
//             v2/v3 = low/high half of the following 4-element group ("B")
//   v4-v7, v10-v29   input/weight operand pairs of the software pipeline
//   v8, v9    not used
//
// Weight stream layout as consumed through x19: for each group of 4 elements,
// 4 bias floats followed by 9 x 4 weight floats (one vector per tap).
//
// x19, x20 and d10-d15 are saved on entry and restored before RET.
BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55 |
|
|
|
| // Load the two stack-passed arguments: x17 = zero pointer, x16 = address of the clamp bounds.
|
LDP x17, x16, [sp] |
|
|
|
| // Spill the callee-saved registers this kernel overwrites (x19, x20, d10-d15); sp drops by 64.
|
STP x19, x20, [sp, -64]! |
|
STP d10, d11, [sp, 16] |
|
STP d12, d13, [sp, 32] |
|
STP d14, d15, [sp, 48] |
|
|
|
| // Broadcast the two floats at [x16]: first -> every lane of v30, second -> every lane of v31.
|
LD2R {v30.4s, v31.4s}, [x16] |
|
|
|
0: |
|
| // Outer loop, one iteration per output pixel: load the 9 input pointers i0-i8 from the table at x2.
|
LDP x8, x9, [x2] |
|
LDP x10, x11, [x2, 16] |
|
LDP x12, x13, [x2, 32] |
|
LDP x14, x15, [x2, 48] |
|
LDR x16, [x2, 64] |
|
| // Add input_offset (x7) to each pointer unless it equals zero (x17). ADD leaves the CMP flags intact for CSEL.
|
CMP x8, x17 // if i0 == zero |
|
ADD x8, x8, x7 // i0 += input_offset |
|
CSEL x8, x17, x8, EQ // i0 = (i0 == zero) ? zero : i0 + input_offset |
|
CMP x9, x17 // if i1 == zero |
|
ADD x9, x9, x7 // i1 += input_offset |
|
CSEL x9, x17, x9, EQ // i1 = (i1 == zero) ? zero : i1 + input_offset |
|
CMP x10, x17 // if i2 == zero |
|
ADD x10, x10, x7 // i2 += input_offset |
|
CSEL x10, x17, x10, EQ // i2 = (i2 == zero) ? zero : i2 + input_offset |
|
CMP x11, x17 // if i3 == zero |
|
ADD x11, x11, x7 // i3 += input_offset |
|
CSEL x11, x17, x11, EQ // i3 = (i3 == zero) ? zero : i3 + input_offset |
|
CMP x12, x17 // if i4 == zero |
|
ADD x12, x12, x7 // i4 += input_offset |
|
CSEL x12, x17, x12, EQ // i4 = (i4 == zero) ? zero : i4 + input_offset |
|
CMP x13, x17 // if i5 == zero |
|
ADD x13, x13, x7 // i5 += input_offset |
|
CSEL x13, x17, x13, EQ // i5 = (i5 == zero) ? zero : i5 + input_offset |
|
CMP x14, x17 // if i6 == zero |
|
ADD x14, x14, x7 // i6 += input_offset |
|
CSEL x14, x17, x14, EQ // i6 = (i6 == zero) ? zero : i6 + input_offset |
|
CMP x15, x17 // if i7 == zero |
|
ADD x15, x15, x7 // i7 += input_offset |
|
CSEL x15, x17, x15, EQ // i7 = (i7 == zero) ? zero : i7 + input_offset |
|
CMP x16, x17 // if i8 == zero |
|
ADD x16, x16, x7 // i8 += input_offset |
|
CSEL x16, x17, x16, EQ // i8 = (i8 == zero) ? zero : i8 + input_offset |
|
|
|
| // Advance the pointer table by x5 bytes for the next pixel (presumably input_stride -- TODO confirm).
|
ADD x2, x2, x5 |
|
|
|
|
|
| // x20 = element count - 8; the flags from this SUBS are consumed by the B.LO below (MOV does not touch them).
|
SUBS x20, x0, 8 |
|
| // x19 = weight-stream cursor, rewound to x3 for every pixel.
|
MOV x19, x3 |
|
|
|
| // Fewer than 8 elements (unsigned): skip the pipelined main loop and go to the remainder code.
|
B.LO 3f |
|
|
|
| // Software-pipeline prologue for the first 8 elements: group A = elements 0-3 (v0/v1), group B = elements 4-7 (v2/v3).
|
|
|
| // Bias for group A, low half (2 floats) -> v0.
|
LD1 {v0.2S}, [x19], 8 |
|
|
|
| // Bias for group A, high half -> v1.
|
LD1 {v1.2S}, [x19], 8 |
|
|
|
| // Tap 0, group A: input from i0 (x8) and weight from x19, low half then high half.
|
LD1 {v4.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v6.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
| // Tap 1, group A: input from i1 (x9).
|
LD1 {v28.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v10.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
| // Tap 2, group A: input from i2 (x10).
|
LD1 {v12.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v14.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Tap 3, group A: input from i3 (x11).
|
LD1 {v16.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v18.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
| // Tap 4, group A: input from i4 (x12).
|
LD1 {v20.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v22.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
| // Tap 5, group A: input from i5 (x13).
|
LD1 {v24.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v26.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
|
|
| // Group A, taps 0-2: accumulate into v0/v1; each freed operand pair is reloaded with taps 6-8 (i6, i7, i8).
|
FMLA v0.2S, v4.2S, v5.2S |
|
|
|
LD1 {v4.2S}, [x14], 8 |
|
|
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v6.2S, v7.2S |
|
|
|
LD1 {v6.2S}, [x14], 8 |
|
|
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v28.2S, v29.2S |
|
|
|
LD1 {v28.2S}, [x15], 8 |
|
|
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v10.2S, v11.2S |
|
|
|
LD1 {v10.2S}, [x15], 8 |
|
|
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v12.2S, v13.2S |
|
|
|
LD1 {v12.2S}, [x16], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v14.2S, v15.2S |
|
|
|
LD1 {v14.2S}, [x16], 8 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Bias for group B (the next 4 elements): low half -> v2, high half -> v3.
|
LD1 {v2.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v3.2S}, [x19], 8 |
|
|
|
| // Group A, taps 3-5: accumulate while loading group B taps 0-2 (i0, i1, i2) into the freed registers.
|
FMLA v0.2S, v16.2S, v17.2S |
|
|
|
LD1 {v16.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v18.2S, v19.2S |
|
|
|
LD1 {v18.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v20.2S, v21.2S |
|
|
|
LD1 {v20.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v22.2S, v23.2S |
|
|
|
LD1 {v22.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v24.2S, v25.2S |
|
|
|
LD1 {v24.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v26.2S, v27.2S |
|
|
|
LD1 {v26.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
|
|
| // Group A, taps 6-8: accumulate while loading group B taps 3-5 (i3, i4, i5).
|
FMLA v0.2S, v4.2S, v5.2S |
|
|
|
LD1 {v4.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v6.2S, v7.2S |
|
|
|
LD1 {v6.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v28.2S, v29.2S |
|
|
|
LD1 {v28.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v10.2S, v11.2S |
|
|
|
LD1 {v10.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v12.2S, v13.2S |
|
|
|
LD1 {v12.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v14.2S, v15.2S |
|
|
|
LD1 {v14.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Group B, taps 0-2 -> v2/v3, interleaved with clamping group A (FMAX v30, then FMIN v31) and loading group B taps 6-8.
|
FMLA v2.2S, v16.2S, v17.2S |
|
|
|
LD1 {v16.2S}, [x14], 8 |
|
|
|
|
|
FMAX v0.2S, v0.2S, v30.2S |
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v18.2S, v19.2S |
|
|
|
LD1 {v18.2S}, [x14], 8 |
|
|
|
|
|
FMAX v1.2S, v1.2S, v30.2S |
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v20.2S, v21.2S |
|
|
|
LD1 {v20.2S}, [x15], 8 |
|
|
|
|
|
FMIN v0.2S, v0.2S, v31.2S |
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v22.2S, v23.2S |
|
|
|
LD1 {v22.2S}, [x15], 8 |
|
|
|
|
|
FMIN v1.2S, v1.2S, v31.2S |
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v24.2S, v25.2S |
|
|
|
LD1 {v24.2S}, [x16], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v26.2S, v27.2S |
|
|
|
LD1 {v26.2S}, [x16], 8 |
|
|
|
| // Store the 4 finished group A results (2 floats from d0, 2 from d1).
|
STP d0, d1, [x4], 16 |
|
|
|
| // x20 -= 8; the following LD1 does not alter flags, so B.LO still sees this result.
|
SUBS x20, x20, 8 |
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
| // Fewer than 8 further elements: skip the steady-state loop and drain group B at 2:.
|
B.LO 2f |
|
|
|
1: |
|
|
|
| // Steady-state loop, 8 elements per iteration; on entry group B has taps 0-2 accumulated and taps 3-8 loaded.
|
| // Bias for the next group A -> v0 (low half), v1 (high half).
|
LD1 {v0.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v1.2S}, [x19], 8 |
|
|
|
| // Group B, taps 3-5 -> v2/v3 while loading the next group A taps 0-2 (i0, i1, i2).
|
FMLA v2.2S, v4.2S, v5.2S |
|
|
|
LD1 {v4.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v6.2S, v7.2S |
|
|
|
LD1 {v6.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v28.2S, v29.2S |
|
|
|
LD1 {v28.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v10.2S, v11.2S |
|
|
|
LD1 {v10.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v12.2S, v13.2S |
|
|
|
LD1 {v12.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v14.2S, v15.2S |
|
|
|
LD1 {v14.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Group B, taps 6-8 -> v2/v3 while loading group A taps 3-5 (i3, i4, i5).
|
FMLA v2.2S, v16.2S, v17.2S |
|
|
|
LD1 {v16.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v18.2S, v19.2S |
|
|
|
LD1 {v18.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v20.2S, v21.2S |
|
|
|
LD1 {v20.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v22.2S, v23.2S |
|
|
|
LD1 {v22.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v24.2S, v25.2S |
|
|
|
LD1 {v24.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v26.2S, v27.2S |
|
|
|
LD1 {v26.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
|
|
| // Group A, taps 0-2 -> v0/v1, interleaved with clamping group B (FMAX v30, FMIN v31) and loading group A taps 6-8.
|
FMLA v0.2S, v4.2S, v5.2S |
|
|
|
LD1 {v4.2S}, [x14], 8 |
|
|
|
|
|
FMAX v2.2S, v2.2S, v30.2S |
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v6.2S, v7.2S |
|
|
|
LD1 {v6.2S}, [x14], 8 |
|
|
|
|
|
FMAX v3.2S, v3.2S, v30.2S |
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v28.2S, v29.2S |
|
|
|
LD1 {v28.2S}, [x15], 8 |
|
|
|
|
|
FMIN v2.2S, v2.2S, v31.2S |
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v10.2S, v11.2S |
|
|
|
LD1 {v10.2S}, [x15], 8 |
|
|
|
|
|
FMIN v3.2S, v3.2S, v31.2S |
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v12.2S, v13.2S |
|
|
|
LD1 {v12.2S}, [x16], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v14.2S, v15.2S |
|
|
|
LD1 {v14.2S}, [x16], 8 |
|
|
|
| // Store the 4 finished group B results from the previous 8-element step.
|
STP d2, d3, [x4], 16 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Bias for this step's group B -> v2 (low half), v3 (high half).
|
LD1 {v2.2S}, [x19], 8 |
|
|
|
|
|
LD1 {v3.2S}, [x19], 8 |
|
|
|
| // Group A, taps 3-5 -> v0/v1 while loading group B taps 0-2 (i0, i1, i2).
|
FMLA v0.2S, v16.2S, v17.2S |
|
|
|
LD1 {v16.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v18.2S, v19.2S |
|
|
|
LD1 {v18.2S}, [x8], 8 |
|
|
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v20.2S, v21.2S |
|
|
|
LD1 {v20.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v22.2S, v23.2S |
|
|
|
LD1 {v22.2S}, [x9], 8 |
|
|
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v24.2S, v25.2S |
|
|
|
LD1 {v24.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v26.2S, v27.2S |
|
|
|
LD1 {v26.2S}, [x10], 8 |
|
|
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
|
|
| // Group A, taps 6-8 -> v0/v1 while loading group B taps 3-5 (i3, i4, i5).
|
FMLA v0.2S, v4.2S, v5.2S |
|
|
|
LD1 {v4.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v5.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v6.2S, v7.2S |
|
|
|
LD1 {v6.2S}, [x11], 8 |
|
|
|
|
|
LD1 {v7.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v28.2S, v29.2S |
|
|
|
LD1 {v28.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v29.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v10.2S, v11.2S |
|
|
|
LD1 {v10.2S}, [x12], 8 |
|
|
|
|
|
LD1 {v11.2S}, [x19], 8 |
|
|
|
|
|
FMLA v0.2S, v12.2S, v13.2S |
|
|
|
LD1 {v12.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v13.2S}, [x19], 8 |
|
|
|
|
|
FMLA v1.2S, v14.2S, v15.2S |
|
|
|
LD1 {v14.2S}, [x13], 8 |
|
|
|
|
|
LD1 {v15.2S}, [x19], 8 |
|
|
|
| // Group B, taps 0-2 -> v2/v3, interleaved with clamping group A and loading group B taps 6-8 (i6, i7, i8).
|
FMLA v2.2S, v16.2S, v17.2S |
|
|
|
LD1 {v16.2S}, [x14], 8 |
|
|
|
|
|
FMAX v0.2S, v0.2S, v30.2S |
|
|
|
LD1 {v17.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v18.2S, v19.2S |
|
|
|
LD1 {v18.2S}, [x14], 8 |
|
|
|
|
|
FMAX v1.2S, v1.2S, v30.2S |
|
|
|
LD1 {v19.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v20.2S, v21.2S |
|
|
|
LD1 {v20.2S}, [x15], 8 |
|
|
|
|
|
FMIN v0.2S, v0.2S, v31.2S |
|
|
|
LD1 {v21.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v22.2S, v23.2S |
|
|
|
LD1 {v22.2S}, [x15], 8 |
|
|
|
|
|
FMIN v1.2S, v1.2S, v31.2S |
|
|
|
LD1 {v23.2S}, [x19], 8 |
|
|
|
|
|
FMLA v2.2S, v24.2S, v25.2S |
|
|
|
LD1 {v24.2S}, [x16], 8 |
|
|
|
|
|
LD1 {v25.2S}, [x19], 8 |
|
|
|
|
|
FMLA v3.2S, v26.2S, v27.2S |
|
|
|
LD1 {v26.2S}, [x16], 8 |
|
|
|
| // Store the 4 finished group A results.
|
STP d0, d1, [x4], 16 |
|
|
|
| // x20 -= 8; the following LD1 does not alter flags, so B.HS still sees this result.
|
SUBS x20, x20, 8 |
|
|
|
LD1 {v27.2S}, [x19], 8 |
|
| // Repeat while the subtraction did not borrow, i.e. at least 8 more elements remain.
|
B.HS 1b |
|
|
|
2: |
|
|
|
|
|
| // Pipeline drain: finish the last group B (taps 3-5 in v4-v15, taps 6-8 in v16-v27), then clamp and store it.
|
FMLA v2.2S, v4.2S, v5.2S |
|
|
|
|
|
FMLA v3.2S, v6.2S, v7.2S |
|
|
|
|
|
FMLA v2.2S, v28.2S, v29.2S |
|
|
|
|
|
FMLA v3.2S, v10.2S, v11.2S |
|
|
|
|
|
FMLA v2.2S, v12.2S, v13.2S |
|
|
|
|
|
FMLA v3.2S, v14.2S, v15.2S |
|
|
|
|
|
FMLA v2.2S, v16.2S, v17.2S |
|
|
|
|
|
FMLA v3.2S, v18.2S, v19.2S |
|
|
|
|
|
FMLA v2.2S, v20.2S, v21.2S |
|
|
|
|
|
FMLA v3.2S, v22.2S, v23.2S |
|
|
|
|
|
FMLA v2.2S, v24.2S, v25.2S |
|
|
|
|
|
FMLA v3.2S, v26.2S, v27.2S |
|
|
|
| // Clamp: raise to at least v30, then limit to at most v31.
|
FMAX v2.2S, v2.2S, v30.2S |
|
|
|
|
|
FMAX v3.2S, v3.2S, v30.2S |
|
|
|
|
|
FMIN v2.2S, v2.2S, v31.2S |
|
|
|
|
|
FMIN v3.2S, v3.2S, v31.2S |
|
|
|
| // Store the final 4 results of the main loop.
|
STP d2, d3, [x4], 16 |
|
|
|
3: |
|
| // x20 differs from the element count by a multiple of 8, so bit 2 set means a group of 4 elements remains.
|
TBZ x20, 2, 4f |
|
|
|
LDR q10, [x8], 16 // load 9 inputs |
|
LDP q0, q1, [x19], 32 // load bias and 9 weights |
|
LDR q11, [x9], 16 |
|
LDR q12, [x10], 16 |
|
LDR q13, [x11], 16 |
|
LDR q14, [x12], 16 |
|
LDR q15, [x13], 16 |
|
LDR q16, [x14], 16 |
|
LDR q17, [x15], 16 |
|
LDR q18, [x16], 16 |
|
LDP q2, q3, [x19], 32 |
|
LDP q4, q5, [x19], 32 |
|
LDP q6, q7, [x19], 32 |
|
LDP q28, q29, [x19], 32 |
|
| // v0 (bias) += weight_t * input_t for the 9 taps, 4 lanes at a time.
|
FMLA v0.4S, v1.4S, v10.4S |
|
FMLA v0.4S, v2.4S, v11.4S |
|
FMLA v0.4S, v3.4S, v12.4S |
|
FMLA v0.4S, v4.4S, v13.4S |
|
FMLA v0.4S, v5.4S, v14.4S |
|
FMLA v0.4S, v6.4S, v15.4S |
|
FMLA v0.4S, v7.4S, v16.4S |
|
FMLA v0.4S, v28.4S, v17.4S |
|
FMLA v0.4S, v29.4S, v18.4S |
|
| // Clamp to [v30, v31].
|
FMAX v0.4S, v0.4S, v30.4S |
|
FMIN v0.4S, v0.4S, v31.4S |
|
| // Store 4 results.
|
STR q0, [x4], 16 |
|
|
|
4: |
|
| // Low two bits of x20 give the final 1-3 element remainder; none left -> finish this pixel at 6:.
|
TST x20, 3 |
|
B.EQ 6f |
|
| // NOTE(review): full 4-lane vectors are loaded although only 1-3 results are stored; assumes the buffers are readable that far -- confirm.
|
LDR q10, [x8], 16 // load 9 inputs |
|
LDP q0, q1, [x19], 32 // load bias and 9 weights |
|
LDR q11, [x9], 16 |
|
LDR q12, [x10], 16 |
|
LDR q13, [x11], 16 |
|
LDR q14, [x12], 16 |
|
LDR q15, [x13], 16 |
|
LDR q16, [x14], 16 |
|
LDR q17, [x15], 16 |
|
LDR q18, [x16], 16 |
|
LDP q2, q3, [x19], 32 |
|
LDP q4, q5, [x19], 32 |
|
LDP q6, q7, [x19], 32 |
|
LDP q28, q29, [x19], 32 |
|
| // Same 9-tap accumulation and clamp as the 4-element case above.
|
FMLA v0.4S, v1.4S, v10.4S |
|
FMLA v0.4S, v2.4S, v11.4S |
|
FMLA v0.4S, v3.4S, v12.4S |
|
FMLA v0.4S, v4.4S, v13.4S |
|
FMLA v0.4S, v5.4S, v14.4S |
|
FMLA v0.4S, v6.4S, v15.4S |
|
FMLA v0.4S, v7.4S, v16.4S |
|
FMLA v0.4S, v28.4S, v17.4S |
|
FMLA v0.4S, v29.4S, v18.4S |
|
|
|
FMAX v0.4S, v0.4S, v30.4S |
|
FMIN v0.4S, v0.4S, v31.4S |
|
| // Bit 1 clear -> exactly 1 element remains (bit 0 must be set here): store just lane 0 at 5:.
|
TBZ x20, 1, 5f |
|
| // Bit 1 set: store lanes 0-1.
|
STR d0, [x4], 8 |
|
DUP d0, v0.D[1] | // move lanes 2-3 into the low half so s0 now holds lane 2
|
TBZ x20, 0, 6f | // bit 0 clear -> exactly 2 elements, nothing more to store
|
5: |
|
STR s0, [x4], 4 |
|
6: |
|
| // One pixel done: decrement the pixel count; ADD leaves the flags intact for B.NE.
|
SUBS x1, x1, 1 |
|
| // Advance the output pointer by x6 bytes (presumably output_increment -- TODO confirm).
|
ADD x4, x4, x6 |
|
| // Process the next pixel while the count is non-zero.
|
B.NE 0b |
|
|
|
| // Restore the callee-saved registers spilled in the prologue and release the 64-byte frame.
|
LDP d14, d15, [sp, 48] |
|
LDP d12, d13, [sp, 32] |
|
LDP d10, d11, [sp, 16] |
|
LDP x19, x20, [sp], 64 |
|
RET |
|
|
|
END_FUNCTION xnn_f32_dwconv_minmax_ukernel_9p4c__asm_aarch64_neonfma_cortex_a55 |
|
|
|
|
|
.section ".note.GNU-stack","",%progbits | // empty marker section with no flags; by GNU toolchain convention this tells the linker the object does not need an executable stack
|
|
|
|