// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__asm_aarch32_vfp_ld64(
//     size_t mr,                                 r0
//     size_t nc,                                 r1
//     size_t kc,                                 r2 -> r5
//     const float* a,                            r3
//     size_t a_stride,                           sp + 96 -> (r11)
//     const float* w,                            sp + 100 -> r9
//     float* c,                                  sp + 104 -> r6
//     size_t cm_stride,                          sp + 108 -> (r7)
//     size_t cn_stride,                          sp + 112 -> r11
//     const union xnn_f32_default_params params) sp + 116 -> (r11)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Register usage
// A0  r3   s0-s1   d0
// A1  r12  s2-s3   d1
// A2  r10  s4-s5   d2
// A3  r0   s6-s7   d3
// B   r9   s8,  s9,  s10, s11  d4-d5
// B        s12, s13, s14, s15  d6-d7
// C0  r6   s16-s17 d8   s18-s19 d9
// C1  r4   s20-s21 d10  s22-s23 d11
// C2  r8   s24-s25 d12  s26-s27 d13
// C3  r7   s28-s29 d14  s30-s31 d15
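
// Rough C-level sketch of the computation below (illustrative only; the
// names acc, a and b are local to this comment, not part of the kernel):
//
//   for (size_t n = 0; n < 4; n++) {
//     acc[0][n] = acc[1][n] = acc[2][n] = acc[3][n] = w[n];  // bias
//   }
//   const float* b = w + 4;                  // packed B, 4 floats per k
//   for (size_t k = 0; k < kc / sizeof(float); k++) {
//     for (size_t m = 0; m < 4; m++) {
//       for (size_t n = 0; n < 4; n++) {
//         acc[m][n] += a[m][k] * b[k * 4 + n];
//       }
//     }
//   }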

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__asm_aarch32_vfp_ld64
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        # Push 96 bytes
        PUSH {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH {d8-d15}                           // +64 = 96

        LDR r11, [sp, 96]        // Load a_stride
        LDRD r6, r7, [sp, 104]   // Load c and cm_stride
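        # Stack arguments start at sp + 96 because the PUSH/VPUSH above moved
        # sp down by 96 bytes.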

        # Clamp A and C pointers
        CMP r0, 2                // if mr >= 2
        ADD r12, r3, r11         //   a1 = a0 + a_stride
        ADD r4, r6, r7           //   c1 = c0 + cm_stride
        MOVLO r12, r3            // a1
        MOVLO r4, r6             // c1
        LDR r9, [sp, 100]        // Load w
                                 // if mr > 2
        ADD r10, r12, r11        //   a2 = a1 + a_stride
        ADD r8, r4, r7           //   c2 = c1 + cm_stride
        MOVLS r10, r12           // a2
        MOVLS r8, r4             // c2
        CMP r0, 4                // if mr >= 4
        ADD r0, r10, r11         //   a3 = a2 + a_stride
        ADD r7, r8, r7           //   c3 = c2 + cm_stride
        LDR r11, [sp, 112]       // Load cn_stride
        MOVLO r0, r10            // a3
        MOVLO r7, r8             // c3
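
        # In effect (illustrative):
        #   a1 = mr < 2  ? a0 : a0 + a_stride;   c1 = mr < 2  ? c0 : c0 + cm_stride;
        #   a2 = mr <= 2 ? a1 : a1 + a_stride;   c2 = mr <= 2 ? c1 : c1 + cm_stride;
        #   a3 = mr < 4  ? a2 : a2 + a_stride;   c3 = mr < 4  ? c2 : c2 + cm_stride;
        # Rows past mr alias the previous row, so they are computed but never
        # stored out of bounds.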

0:
        # Load initial bias from w into accumulators
        VLDM r9!, {d8-d9}        // Bias
        SUBS r5, r2, 8
        VMOV.F64 d10, d8
        VMOV.F64 d12, d8
        VMOV.F64 d14, d8
        VMOV.F64 d11, d9
        VMOV.F64 d13, d9
        VMOV.F64 d15, d9
        BLO 3f                   // less than 2 channels?
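        # r5 now holds kc - 8; the BLO above catches kc < 8 bytes (fewer than
        # two floats of K), in which case only the single-float tail runs.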

        # Main loop - 2 floats of A (8 bytes)
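        # Each iteration loads 2 floats from every A row plus two groups of 4
        # B values, and performs two rank-1 updates of the 4x4 tile in s16-s31.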
1:
        VLDM r3!, {d0}           // A0
        VLDM r9!, {d4-d5}        // B0
        VLDM r12!, {d1}          // A1
        VLDM r10!, {d2}          // A2
        VLDM r0!, {d3}           // A3
        VMLA.F32 s16, s8, s0
        VMLA.F32 s17, s9, s0
        VMLA.F32 s20, s8, s2
        VMLA.F32 s21, s9, s2
        VMLA.F32 s24, s8, s4
        VMLA.F32 s25, s9, s4
        VMLA.F32 s28, s8, s6
        VMLA.F32 s29, s9, s6
        VLDM r9!, {d6-d7}        // B1
        VMLA.F32 s18, s10, s0
        VMLA.F32 s19, s11, s0
        VMLA.F32 s22, s10, s2
        VMLA.F32 s23, s11, s2
        VMLA.F32 s26, s10, s4
        VMLA.F32 s27, s11, s4
        VMLA.F32 s30, s10, s6
        VMLA.F32 s31, s11, s6
        VMLA.F32 s16, s12, s1
        VMLA.F32 s17, s13, s1
        VMLA.F32 s20, s12, s3
        VMLA.F32 s21, s13, s3
        VMLA.F32 s24, s12, s5
        VMLA.F32 s25, s13, s5
        VMLA.F32 s28, s12, s7
        VMLA.F32 s29, s13, s7
        SUBS r5, r5, 8
        VMLA.F32 s18, s14, s1
        VMLA.F32 s19, s15, s1
        VMLA.F32 s22, s14, s3
        VMLA.F32 s23, s15, s3
        VMLA.F32 s26, s14, s5
        VMLA.F32 s27, s15, s5
        VMLA.F32 s30, s14, s7
        VMLA.F32 s31, s15, s7
        BHS 1b
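
        # At this point r5 = (kc & 7) - 8, so bit 2 is set exactly when 4
        # bytes (one float) of K remain unprocessed.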
        # Is there a remainder? - 1 float of A (4 bytes)
        TST r5, 4
        BNE 3f

2:
        SUBS r1, r1, 4
        BLO 4f

        # Store full 4 x 4
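        # Store the full 4x4 tile, rewind every A pointer by kc for the next
        # column block, and advance every C pointer by cn_stride; loop back to
        # 0b while columns remain.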
        VSTM r6, {d8-d9}
        SUB r0, r0, r2
        ADD r6, r11
        VSTM r4, {d10-d11}
        SUB r10, r10, r2
        ADD r4, r11
        VSTM r8, {d12-d13}
        SUB r12, r12, r2
        ADD r8, r11
        VSTM r7, {d14-d15}
        SUB r3, r3, r2
        ADD r7, r11
        BHI 0b

        VPOP {d8-d15}
        POP {r4, r5, r6, r7, r8, r9, r10, r11}
        BX lr

3:
        # Remainder - 1 float of A (4 bytes)
        VLDM r3!, {s0}           // A0
        VLDM r9!, {d6-d7}        // B
        VLDM r12!, {s1}          // A1
        VLDM r10!, {s2}          // A2
        VLDM r0!, {s3}           // A3
        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0
        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1
        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2
        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3
        B 2b

        # Store odd width
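        # r1 = nc - 4 here, so its low two bits match nc: bit 1 selects a
        # 2-float store (then shifts the upper pair of each row down), bit 0
        # a final 1-float store.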
4:
        TST r1, 2
        BEQ 5f
        VSTM r6!, {d8}
        VMOV.F32 s16, s18
        VSTM r4!, {d10}
        VMOV.F32 s20, s22
        VSTM r8!, {d12}
        VMOV.F32 s24, s26
        VSTM r7!, {d14}
        VMOV.F32 s28, s30

5:
        TST r1, 1
        BEQ 6f
        VSTR s16, [r6]
        VSTR s20, [r4]
        VSTR s24, [r8]
        VSTR s28, [r7]

6:
        VPOP {d8-d15}
        POP {r4, r5, r6, r7, r8, r9, r10, r11}
        BX lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__asm_aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif