|
|
|
|
|
|
|
|
|
|
|
#include <assert.h> |
|
#include <stddef.h> |
|
#include <stdint.h> |
|
|
|
#include <xnnpack/math.h> |
|
#include <xnnpack/fft.h> |
|
|
|
#include <arm_neon.h> |
|
|
|
void xnn_cs16_fftr_ukernel__neon_x4( |
|
size_t samples, |
|
int16_t* data, |
|
const int16_t* twiddle) |
|
{ |
|
assert(samples != 0); |
|
assert(samples % 8 == 0); |
|
assert(data != NULL); |
|
assert(twiddle != NULL); |
|
|
|
int16_t* dl = data; |
|
int16_t* dr = data + samples * 2; |
|
int32_t vdcr = (int32_t) dl[0]; |
|
int32_t vdci = (int32_t) dl[1]; |
|
|
|
vdcr = math_asr_s32(vdcr * 16383 + 16384, 15); |
|
vdci = math_asr_s32(vdci * 16383 + 16384, 15); |
|
|
|
dl[0] = vdcr + vdci; |
|
dl[1] = 0; |
|
dl += 2; |
|
dr[0] = vdcr - vdci; |
|
dr[1] = 0; |
|
|
|
const int16x4_t vdiv2 = vdup_n_s16(16383); |
|
|
|
do { |
|
dr -= 8; |
|
const int16x4x2_t vil = vld2_s16(dl); |
|
const int16x4x2_t vir = vld2_s16(dr); |
|
const int16x4x2_t vtw = vld2_s16(twiddle); twiddle += 8; |
|
|
|
int16x4_t virr = vrev64_s16(vir.val[0]); |
|
int16x4_t viri = vrev64_s16(vir.val[1]); |
|
|
|
const int16x4_t vilr = vqrdmulh_s16(vil.val[0], vdiv2); |
|
const int16x4_t vili = vqrdmulh_s16(vil.val[1], vdiv2); |
|
virr = vqrdmulh_s16(virr, vdiv2); |
|
viri = vqrdmulh_s16(viri, vdiv2); |
|
|
|
const int32x4_t vacc1r = vaddl_s16(vilr, virr); |
|
const int32x4_t vacc1i = vsubl_s16(vili, viri); |
|
const int16x4_t vacc2r = vsub_s16(vilr, virr); |
|
const int16x4_t vacc2i = vadd_s16(vili, viri); |
|
|
|
int32x4_t vaccr = vmull_s16(vacc2r, vtw.val[0]); |
|
int32x4_t vacci = vmull_s16(vacc2r, vtw.val[1]); |
|
vaccr = vmlsl_s16(vaccr, vacc2i, vtw.val[1]); |
|
vacci = vmlal_s16(vacci, vacc2i, vtw.val[0]); |
|
vaccr = vrshrq_n_s32(vaccr, 15); |
|
vacci = vrshrq_n_s32(vacci, 15); |
|
|
|
const int32x4_t vacclr = vhaddq_s32(vacc1r, vaccr); |
|
const int32x4_t vaccli = vhaddq_s32(vacc1i, vacci); |
|
const int32x4_t vaccrr = vhsubq_s32(vacc1r, vaccr); |
|
const int32x4_t vaccri = vhsubq_s32(vacci, vacc1i); |
|
|
|
int16x4x2_t voutl; |
|
int16x4x2_t voutr; |
|
voutl.val[0] = vmovn_s32(vacclr); |
|
voutl.val[1] = vmovn_s32(vaccli); |
|
voutr.val[0] = vrev64_s16(vmovn_s32(vaccrr)); |
|
voutr.val[1] = vrev64_s16(vmovn_s32(vaccri)); |
|
|
|
vst2_s16(dl, voutl); |
|
vst2_s16(dr, voutr); |
|
dl += 8; |
|
|
|
samples -= 8; |
|
} while(samples != 0); |
|
} |
|
|