%include "libavutil/x86/x86util.asm" |
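; Single-precision (float) transforms: power-of-two split-radix FFTs, 15-point
; and 15xM prime-factor FFTs, and the inverse MDCT, for SSE2/SSE3/AVX/FMA3/AVX2.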
|
|
|
%define private_prefix ff_tx |
|
|
|
%if ARCH_X86_64 |
|
%define ptr resq |
|
%else |
|
%define ptr resd |
|
%endif |
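; Twiddle tables defined elsewhere: tab_16_float ... tab_131072_float for the
; power-of-two transforms, tab_53_float for the 15-point transform.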
|
|
|
%assign i 16 |
|
%rep 14 |
|
cextern tab_ %+ i %+ _float |
|
%assign i (i << 1) |
|
%endrep |
|
|
|
cextern tab_53_float |
|
|
|
; Assembly-side view of AVTXContext; the field offsets must match the C struct.
struc AVTXContext
.len: resd 1 ; transform length
.inv: resd 1 ; inverse-transform flag
.map: ptr 1 ; input index map / lookup table(s)
.exp: ptr 1 ; pre-computed complex exponentials (twiddle factors)
.tmp: ptr 1 ; temporary buffer

.sub: ptr 1 ; subcontext(s)
.fn: ptr 4 ; subcontext function pointers
.nb_sub: resd 1 ; number of subcontexts

endstruc
|
|
|
SECTION_RODATA 32 |
|
|
|
%define POS 0x00000000 |
|
%define NEG 0x80000000 |
|
|
|
%define M_SQRT1_2 0.707106781186547524401 |
|
%define COS16_1 0.92387950420379638671875 |
|
%define COS16_3 0.3826834261417388916015625 |
|
|
|
d8_mult_odd: dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \ |
|
M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2 |
|
|
|
s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 |
|
s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0 |
|
s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3 |
|
s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1 |
|
|
|
s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2 |
|
s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3 |
|
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1 |
|
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2 |
|
|
|
s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1 |
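; Sign-flip masks: each letter of the name gives the sign applied to the
; corresponding lane (p = +, m = -)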
|
|
|
mask_mmppmmmm: dd NEG, NEG, POS, POS, NEG, NEG, NEG, NEG |
|
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG |
|
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG |
|
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS |
|
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG |
|
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS |
|
mask_pmpmpmpm: times 4 dd POS, NEG |
|
|
|
SECTION .text |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro LOAD64_LUT 5-7 |
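; Loads complex (64-bit) values from memory through a 32-bit index LUT.
; args: 1: output register            2: input base address (GPR)
;       3: LUT base address (GPR)     4: byte offset into the LUT
;       5: temporary GPR (scalar path only)
;       6: optional temporary vector register (upper half of ymm loads)
;       7: optional mask register; if given and AVX2 is enabled, the whole
;          load is done with a single vgatherdpd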
|
%if %0 > 6 && cpuflag(avx2) |
|
pcmpeqd %7, %7 |
|
movupd xmm%6, [%3 + %4] |
|
vgatherdpd %1, [%2 + xmm%6*8], %7 |
|
%else |
|
mov %5d, [%3 + %4 + 0] |
|
movsd xmm%1, [%2 + %5q*8] |
|
%if sizeof%1 > 16 && %0 > 5 |
|
mov %5d, [%3 + %4 + 8] |
|
movsd xmm%6, [%2 + %5q*8] |
|
%endif |
|
mov %5d, [%3 + %4 + 4] |
|
movhps xmm%1, [%2 + %5q*8] |
|
%if sizeof%1 > 16 && %0 > 5 |
|
mov %5d, [%3 + %4 + 12] |
|
movhps xmm%6, [%2 + %5q*8] |
|
vinsertf128 %1, %1, xmm%6, 1 |
|
%endif |
|
%endif |
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro FFT2 2 |
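; Single 2-point in-place complex FFT
; args: 1: coefficients (in[0].re/im, in[1].re/im)   2: temporary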
|
shufps %2, %1, %1, q3322 |
|
shufps %1, %1, %1, q1100 |
|
|
|
addsubps %1, %1, %2 |
|
|
|
shufps %1, %1, %1, q2031 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
%macro FFT4 3 |
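; Single 4-point in-place complex FFT (two transforms at once when the
; arguments are ymm registers)
; args: 1, 2: the four complex coefficients (two per register)   3: temporary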
|
subps %3, %1, %2 |
|
addps %1, %1, %2 |
|
|
|
shufps %2, %1, %3, q1010 |
|
shufps %1, %1, %3, q2332 |
|
|
|
subps %3, %2, %1 |
|
addps %2, %2, %1 |
|
|
|
shufps %1, %2, %3, q1010 |
|
|
|
shufps %2, %2, %3, q2332 |
|
shufps %2, %2, %2, q1320 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro FFT8 6 |
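; Single 8-point in-place complex FFT in 128-bit registers (two transforms at
; once when the arguments are ymm registers)
; args: 1-4: coefficients (two complex values per register)   5, 6: temporaries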
|
addps %5, %1, %3 |
|
addps %6, %2, %4 |
|
|
|
subps %1, %1, %3 |
|
subps %2, %2, %4 |
|
|
|
shufps %4, %1, %1, q2323 |
|
shufps %3, %5, %6, q3032 |
|
|
|
shufps %1, %1, %1, q1010 |
|
shufps %5, %5, %6, q1210 |
|
|
|
xorps %4, %4, [mask_pmmppmmp] |
|
addps %6, %5, %3 |
|
|
|
mulps %2, %2, [d8_mult_odd] |
|
subps %5, %5, %3 |
|
|
|
addps %3, %1, %4 |
|
unpcklpd %1, %6, %5 |
|
|
|
shufps %4, %2, %2, q2301 |
|
shufps %6, %6, %5, q2332 |
|
|
|
addsubps %2, %2, %4 |
|
shufps %5, %2, %2, q0123 |
|
addsubps %5, %5, %2 |
|
|
|
subps %2, %1, %6 |
|
subps %4, %3, %5 |
|
|
|
addps %1, %1, %6 |
|
addps %3, %3, %5 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro FFT8_AVX 4 |
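; Single 8-point in-place complex FFT in two full-width AVX registers
; args: 1, 2: coefficients (four complex values per register)   3, 4: temporaries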
|
subps %3, %1, %2 |
|
addps %1, %1, %2 |
|
|
|
vpermilps %2, %3, [s8_perm_odd1] |
|
shufps %4, %1, %1, q3322 |
|
|
|
movsldup %3, %3 |
|
shufps %1, %1, %1, q1100 |
|
|
|
addsubps %3, %3, %2 |
|
addsubps %1, %1, %4 |
|
|
|
mulps %3, %3, [s8_mult_odd] |
|
vpermilps %1, %1, [s8_perm_even] |
|
|
|
shufps %2, %3, %3, q2332 |
|
xorps %4, %1, [mask_mmmmpppm] |
|
|
|
vpermilps %3, %3, [s8_perm_odd2] |
|
vperm2f128 %1, %1, %4, 0x03 |
|
|
|
addsubps %2, %2, %3 |
|
subps %1, %1, %4 |
|
|
|
vperm2f128 %2, %2, %2, 0x11 |
|
vperm2f128 %3, %3, %3, 0x00 |
|
|
|
xorps %2, %2, [mask_ppmpmmpm] |
|
addps %2, %3, %2 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro FFT16 6-8 |
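; Single 16-point in-place complex FFT (AVX)
; args: 1, 2: coefficients fed to the 8-point sub-transform
;       3, 4: coefficients fed to the two 4-point sub-transforms
;       5, 6: temporaries
;       7, 8: optional extra temporaries used to keep the permute and
;             sign-mask constants in registers instead of memory operands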
|
FFT4 %3, %4, %5 |
|
%if %0 > 7 |
|
FFT8_AVX %1, %2, %6, %7 |
|
movaps %8, [mask_mpmppmpm] |
|
movaps %7, [s16_perm] |
|
%define mask %8 |
|
%define perm %7 |
|
%elif %0 > 6 |
|
FFT8_AVX %1, %2, %6, %7 |
|
movaps %7, [s16_perm] |
|
%define mask [mask_mpmppmpm] |
|
%define perm %7 |
|
%else |
|
FFT8_AVX %1, %2, %6, %5 |
|
%define mask [mask_mpmppmpm] |
|
%define perm [s16_perm] |
|
%endif |
|
xorps %5, %5, %5 |
|
|
|
shufps %6, %4, %4, q2301 |
|
shufps %5, %5, %3, q2301 |
|
|
|
mulps %4, %4, [s16_mult_odd1] |
|
xorps %5, %5, [mask_mppmmpmp] |
|
%if cpuflag(fma3) |
|
fmaddps %6, %6, [s16_mult_odd2], %4 |
|
addps %5, %3, %5 |
|
%else |
|
mulps %6, %6, [s16_mult_odd2] |
|
|
|
addps %5, %3, %5 |
|
addps %6, %4, %6 |
|
%endif |
|
mulps %5, %5, [s16_mult_even] |
|
|
|
xorps %4, %6, mask |
|
xorps %3, %5, mask |
|
|
|
vperm2f128 %4, %4, %4, 0x01 |
|
vperm2f128 %3, %3, %3, 0x01 |
|
|
|
addps %6, %6, %4 |
|
addps %5, %5, %3 |
|
|
|
vpermilps %6, %6, perm |
|
vpermilps %5, %5, perm |
|
|
|
subps %4, %2, %6 |
|
addps %3, %2, %6 |
|
|
|
subps %2, %1, %5 |
|
addps %1, %1, %5 |
|
%undef mask |
|
%undef perm |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
%macro FFT15 0 |
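; Single 15-point complex FFT (used standalone and by the 15xM PFA transform).
; Expects the input pre-loaded: xm0 = in[0..1], xm5 = in[2] (duplicated),
; m2 = in[3..6], m3 = in[7..10], m4 = in[11..14], and the constants
; pre-loaded: m8 = s15_perm, m9/m10 = tab_53_float, m11 = mask_mmppmmmm.
; Leaves the result in m0-m2 and xm14/xm15; clobbers m5-m7 and m12-m15.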
|
shufps xm1, xm0, xm0, q3223 |
|
shufps xm0, xm0, xm0, q1001 |
|
|
|
xorps xm1, xm11 |
|
addps xm1, xm0 |
|
|
|
shufps xm0, xm1, xm1, q3232 |
|
addps xm0, xm5 |
|
|
|
mulps xm1, xm9 |
|
|
|
shufpd xm6, xm1, xm1, 01b |
|
xorps xm1, xm11 |
|
addps xm1, xm1, xm6 |
|
addsubps xm1, xm5, xm1 |
|
|
|
subps m7, m2, m3 |
|
addps m6, m2, m3 |
|
shufps m7, m7, m7, q2301 |
|
|
|
addps m5, m4, m6 |
|
|
|
vperm2f128 m14, m9, m9, 0x11 |
|
vbroadcastsd m15, xm9 |
|
|
|
mulps m6, m14 |
|
mulps m7, m15 |
|
|
|
subps m2, m6, m7 |
|
addps m3, m6, m7 |
|
|
|
shufps m12, m11, m11, q3232 |
|
|
|
addsubps m6, m4, m2 |
|
addsubps m7, m4, m3 |
|
|
|
|
|
vpermpd m2, m5, q0123 |
|
vpermpd m3, m6, q0123 |
|
vpermpd m4, m7, q0123 |
|
|
|
xorps m5, m12 |
|
xorps m6, m12 |
|
xorps m7, m12 |
|
|
|
addps m2, m5 |
|
addps m3, m6 |
|
addps m4, m7 |
|
|
|
movlhps xm14, xm2 |
|
unpcklpd xm15, xm3, xm4 |
|
unpckhpd xm5, xm3, xm4 |
|
|
|
addps xm14, xm2 |
|
addps xm15, xm5 |
|
addps xm14, xm0 |
|
addps xm15, xm1 |
|
|
|
shufps m12, m10, m10, q3232 |
|
shufps m13, m10, m10, q1010 |
|
|
|
mulps m5, m2, m12 |
|
mulps m6, m3, m12 |
|
mulps m7, m4, m12 |
|
|
|
mulps m2, m13 |
|
mulps m3, m13 |
|
mulps m4, m13 |
|
|
|
shufps m5, m5, m5, q1032 |
|
shufps m6, m6, m6, q1032 |
|
shufps m7, m7, m7, q1032 |
|
|
|
vperm2f128 m13, m11, m11, 0x01 |
|
shufps m12, m11, m11, q3232 |
|
|
|
xorps m5, m13 |
|
xorps m6, m13 |
|
xorps m7, m13 |
|
|
|
addps m2, m5 |
|
addps m3, m6 |
|
addps m4, m7 |
|
|
|
shufps m5, m2, m2, q2301 |
|
shufps m6, m3, m3, q2301 |
|
shufps m7, m4, m4, q2301 |
|
|
|
xorps m2, m12 |
|
xorps m3, m12 |
|
xorps m4, m12 |
|
|
|
vpermpd m5, m5, q0123 |
|
vpermpd m6, m6, q0123 |
|
vpermpd m7, m7, q0123 |
|
|
|
addps m5, m2 |
|
addps m6, m3 |
|
addps m7, m4 |
|
|
|
vpermps m5, m8, m5 |
|
vpermps m6, m8, m6 |
|
vpermps m7, m8, m7 |
|
|
|
vbroadcastsd m0, xm0 |
|
vpermpd m2, m1, q1111 |
|
vbroadcastsd m1, xm1 |
|
|
|
addps m0, m5 |
|
addps m1, m6 |
|
addps m2, m7 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
%macro SPLIT_RADIX_COMBINE 17 |
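; One split-radix combine pass: multiplies the odd-half coefficients by their
; twiddle factors and adds/subtracts them to/from the even half.
; args: 1:      set to 1 if the odd inputs still need cross-lane interleaving
;       2-5:    even coefficients (output: even + twiddled odd)
;       6-9:    odd coefficients  (output: even - twiddled odd)
;       10, 11: twiddle factors (from the forward/reverse tables)
;       12-17:  temporaries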
|
%if %1 && mmsize == 32 |
|
vperm2f128 %14, %6, %7, 0x20 |
|
vperm2f128 %16, %9, %8, 0x20 |
|
vperm2f128 %15, %6, %7, 0x31 |
|
vperm2f128 %17, %9, %8, 0x31 |
|
%endif |
|
|
|
shufps %12, %10, %10, q2200 |
|
shufps %13, %11, %11, q1133 |
|
movshdup %10, %10 |
|
shufps %11, %11, %11, q0022 |
|
|
|
%if %1 && mmsize == 32 |
|
shufps %6, %14, %14, q2301 |
|
shufps %8, %16, %16, q2301 |
|
shufps %7, %15, %15, q2301 |
|
shufps %9, %17, %17, q2301 |
|
|
|
mulps %14, %14, %13 |
|
mulps %16, %16, %11 |
|
mulps %15, %15, %13 |
|
mulps %17, %17, %11 |
|
%else |
|
mulps %14, %6, %13 |
|
mulps %16, %8, %11 |
|
mulps %15, %7, %13 |
|
mulps %17, %9, %11 |
|
|
|
shufps %6, %6, %6, q2301 |
|
shufps %8, %8, %8, q2301 |
|
shufps %7, %7, %7, q2301 |
|
shufps %9, %9, %9, q2301 |
|
%endif |
|
|
|
%if cpuflag(fma3) |
|
fmaddsubps %6, %6, %12, %14 |
|
fmaddsubps %8, %8, %10, %16 |
|
fmsubaddps %7, %7, %12, %15 |
|
fmsubaddps %9, %9, %10, %17 |
|
movaps %13, [mask_pmpmpmpm] |
|
%else |
|
mulps %6, %6, %12 |
|
mulps %8, %8, %10 |
|
movaps %13, [mask_pmpmpmpm] |
|
mulps %7, %7, %12 |
|
mulps %9, %9, %10 |
|
addsubps %6, %6, %14 |
|
addsubps %8, %8, %16 |
|
xorps %15, %15, %13 |
|
xorps %17, %17, %13 |
|
addps %7, %7, %15 |
|
addps %9, %9, %17 |
|
%endif |
|
|
|
addps %14, %6, %7 |
|
addps %16, %8, %9 |
|
subps %15, %6, %7 |
|
subps %17, %8, %9 |
|
|
|
shufps %14, %14, %14, q2301 |
|
shufps %16, %16, %16, q2301 |
|
xorps %15, %15, %13 |
|
xorps %17, %17, %13 |
|
|
|
subps %6, %2, %14 |
|
subps %8, %4, %16 |
|
subps %7, %3, %15 |
|
subps %9, %5, %17 |
|
|
|
addps %2, %2, %14 |
|
addps %4, %4, %16 |
|
addps %3, %3, %15 |
|
addps %5, %5, %17 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
%macro SPLIT_RADIX_COMBINE_HALF 10 |
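; Split-radix combine on a single register pair per half.
; args: 1:    selects which halves of the twiddle registers are used, so one
;             twiddle load can serve two combine calls
;       2, 3: even coefficients    4, 5: odd coefficients
;       6, 7: twiddle factors      8-10: temporaries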
|
%if %1 |
|
shufps %8, %6, %6, q2200 |
|
shufps %9, %7, %7, q1133 |
|
%else |
|
shufps %8, %6, %6, q3311 |
|
shufps %9, %7, %7, q0022 |
|
%endif |
|
|
|
mulps %10, %4, %9 |
|
mulps %9, %9, %5 |
|
|
|
shufps %4, %4, %4, q2301 |
|
shufps %5, %5, %5, q2301 |
|
|
|
%if cpuflag(fma3) |
|
fmaddsubps %4, %4, %8, %10 |
|
fmsubaddps %5, %5, %8, %9 |
|
movaps %10, [mask_pmpmpmpm] |
|
%else |
|
mulps %4, %4, %8 |
|
mulps %5, %5, %8 |
|
addsubps %4, %4, %10 |
|
movaps %10, [mask_pmpmpmpm] |
|
xorps %9, %9, %10 |
|
addps %5, %5, %9 |
|
%endif |
|
|
|
addps %8, %4, %5 |
|
subps %9, %4, %5 |
|
|
|
shufps %8, %8, %8, q2301 |
|
xorps %9, %9, %10 |
|
|
|
subps %4, %2, %8 |
|
subps %5, %3, %9 |
|
|
|
addps %2, %2, %8 |
|
addps %3, %3, %9 |
|
%endmacro |
|
|
|
|
|
%macro SPLIT_RADIX_COMBINE_LITE 9 |
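; As SPLIT_RADIX_COMBINE_HALF, but with one temporary register fewer;
; the twiddle shuffle for the second odd register is redone instead.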
|
%if %1 |
|
shufps %8, %6, %6, q2200 |
|
shufps %9, %7, %7, q1133 |
|
%else |
|
shufps %8, %6, %6, q3311 |
|
shufps %9, %7, %7, q0022 |
|
%endif |
|
|
|
mulps %9, %9, %4 |
|
shufps %4, %4, %4, q2301 |
|
|
|
%if cpuflag(fma3) |
|
fmaddsubps %4, %4, %8, %9 |
|
%else |
|
mulps %4, %4, %8 |
|
addsubps %4, %4, %9 |
|
%endif |
|
|
|
%if %1 |
|
shufps %9, %7, %7, q1133 |
|
%else |
|
shufps %9, %7, %7, q0022 |
|
%endif |
|
|
|
mulps %9, %9, %5 |
|
shufps %5, %5, %5, q2301 |
|
%if cpuflag(fma3)
|
fmsubaddps %5, %5, %8, %9 |
|
%else |
|
mulps %5, %5, %8 |
|
xorps %9, %9, [mask_pmpmpmpm] |
|
addps %5, %5, %9 |
|
%endif |
|
|
|
addps %8, %4, %5 |
|
subps %9, %4, %5 |
|
|
|
shufps %8, %8, %8, q2301 |
|
xorps %9, %9, [mask_pmpmpmpm] |
|
|
|
subps %4, %2, %8 |
|
subps %5, %3, %9 |
|
|
|
addps %2, %2, %8 |
|
addps %3, %3, %9 |
|
%endmacro |
|
|
|
%macro SPLIT_RADIX_COMBINE_64 0 |
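; Combine pass for a 64-point block: merges the two 16-point sub-transforms
; held in the tx1_*/tx2_* registers with the 32-point half (in m0-m3 and at
; [outq]) using the tab_64_float twiddles, then stores all 64 points to [outq].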
|
SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2 |
|
|
|
movaps [outq + 0*mmsize], m0 |
|
movaps [outq + 4*mmsize], m1 |
|
movaps [outq + 8*mmsize], tx1_e0 |
|
movaps [outq + 12*mmsize], tx2_e0 |
|
|
|
SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0 |
|
|
|
movaps [outq + 2*mmsize], m2 |
|
movaps [outq + 6*mmsize], m3 |
|
movaps [outq + 10*mmsize], tx1_o0 |
|
movaps [outq + 14*mmsize], tx2_o0 |
|
|
|
movaps tw_e, [tab_64_float + mmsize] |
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
movaps m0, [outq + 1*mmsize] |
|
movaps m1, [outq + 3*mmsize] |
|
movaps m2, [outq + 5*mmsize] |
|
movaps m3, [outq + 7*mmsize] |
|
|
|
SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \ |
|
tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 |
|
|
|
movaps [outq + 1*mmsize], m0 |
|
movaps [outq + 3*mmsize], m1 |
|
movaps [outq + 5*mmsize], m2 |
|
movaps [outq + 7*mmsize], m3 |
|
|
|
movaps [outq + 9*mmsize], tx1_e1 |
|
movaps [outq + 11*mmsize], tx1_o1 |
|
movaps [outq + 13*mmsize], tx2_e1 |
|
movaps [outq + 15*mmsize], tx2_o1 |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
%macro SPLIT_RADIX_LOAD_COMBINE_4 8 |
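; Loads even coefficients from [outq] and [outq + arg 1], odd coefficients from
; [outq + arg 2] and [outq + arg 3] plus the twiddles from [rtabq]/[itabq],
; runs SPLIT_RADIX_COMBINE and stores the results back in place.
; args: 4: coefficient block index   5: twiddle block index
;       6, 7, 8: extra byte offsets for the coefficients, rtab and itab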
|
movaps m8, [rtabq + (%5)*mmsize + %7] |
|
vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23 |
|
|
|
movaps m0, [outq + (0 + %4)*mmsize + %6] |
|
movaps m2, [outq + (2 + %4)*mmsize + %6] |
|
movaps m1, [outq + %1 + (0 + %4)*mmsize + %6] |
|
movaps m3, [outq + %1 + (2 + %4)*mmsize + %6] |
|
|
|
movaps m4, [outq + %2 + (0 + %4)*mmsize + %6] |
|
movaps m6, [outq + %2 + (2 + %4)*mmsize + %6] |
|
movaps m5, [outq + %3 + (0 + %4)*mmsize + %6] |
|
movaps m7, [outq + %3 + (2 + %4)*mmsize + %6] |
|
|
|
SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ |
|
m4, m5, m6, m7, \ |
|
m8, m9, \ |
|
m10, m11, m12, m13, m14, m15 |
|
|
|
movaps [outq + (0 + %4)*mmsize + %6], m0 |
|
movaps [outq + (2 + %4)*mmsize + %6], m2 |
|
movaps [outq + %1 + (0 + %4)*mmsize + %6], m1 |
|
movaps [outq + %1 + (2 + %4)*mmsize + %6], m3 |
|
|
|
movaps [outq + %2 + (0 + %4)*mmsize + %6], m4 |
|
movaps [outq + %2 + (2 + %4)*mmsize + %6], m6 |
|
movaps [outq + %3 + (0 + %4)*mmsize + %6], m5 |
|
movaps [outq + %3 + (2 + %4)*mmsize + %6], m7 |
|
%endmacro |
|
|
|
%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5 |
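; Runs SPLIT_RADIX_LOAD_COMBINE_4 over four register blocks (one full
; 8*mmsize stretch of output).
; args: 1, 2: byte offsets of the other coefficient halves
;       3-5:  optional extra byte offsets (coefficients, rtab, itab)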
|
%if %0 > 2 |
|
%define offset_c %3 |
|
%else |
|
%define offset_c 0 |
|
%endif |
|
%if %0 > 3 |
|
%define offset_r %4 |
|
%else |
|
%define offset_r 0 |
|
%endif |
|
%if %0 > 4 |
|
%define offset_i %5 |
|
%else |
|
%define offset_i 0 |
|
%endif |
|
|
|
SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i |
|
SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i |
|
SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i |
|
SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i |
|
%endmacro |
|
|
|
|
|
|
|
|
|
|
|
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6 |
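; Output-pass combine: two SPLIT_RADIX_COMBINE steps whose results are
; de-interleaved back into natural order before being stored.
; args: 1: coefficient block index    2: twiddle block index
;       3, 4, 5: byte offsets of the other coefficient quarters
;       6: extra byte offset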
|
movaps m8, [rtabq + (0 + %2)*mmsize] |
|
vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23 |
|
|
|
movaps m0, [outq + (0 + 0 + %1)*mmsize + %6] |
|
movaps m2, [outq + (2 + 0 + %1)*mmsize + %6] |
|
movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6] |
|
movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6] |
|
|
|
movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6] |
|
movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6] |
|
movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6] |
|
movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6] |
|
|
|
SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ |
|
m4, m5, m6, m7, \ |
|
m8, m9, \ |
|
m10, m11, m12, m13, m14, m15 |
|
|
|
unpckhpd m10, m0, m2 |
|
unpckhpd m11, m1, m3 |
|
unpckhpd m12, m4, m6 |
|
unpckhpd m13, m5, m7 |
|
unpcklpd m0, m0, m2 |
|
unpcklpd m1, m1, m3 |
|
unpcklpd m4, m4, m6 |
|
unpcklpd m5, m5, m7 |
|
|
|
vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 0], m0, 0 |
|
vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 16], m10, 0 |
|
vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 0], m1, 0 |
|
vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0 |
|
|
|
vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 0], m4, 0 |
|
vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0 |
|
vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 0], m5, 0 |
|
vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0 |
|
|
|
vperm2f128 m10, m10, m0, 0x13 |
|
vperm2f128 m11, m11, m1, 0x13 |
|
vperm2f128 m12, m12, m4, 0x13 |
|
vperm2f128 m13, m13, m5, 0x13 |
|
|
|
movaps m8, [rtabq + (1 + %2)*mmsize] |
|
vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23 |
|
|
|
movaps m0, [outq + (0 + 1 + %1)*mmsize + %6] |
|
movaps m2, [outq + (2 + 1 + %1)*mmsize + %6] |
|
movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6] |
|
movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6] |
|
|
|
movaps [outq + (0 + 1 + %1)*mmsize + %6], m10 |
|
movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 |
|
|
|
movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6] |
|
movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6] |
|
movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6] |
|
movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6] |
|
|
|
movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 |
|
movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 |
|
|
|
SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \ |
|
m4, m5, m6, m7, \ |
|
m8, m9, \ |
|
m10, m11, m12, m13, m14, m15 |
|
|
|
unpcklpd m8, m0, m2 |
|
unpcklpd m9, m1, m3 |
|
unpcklpd m10, m4, m6 |
|
unpcklpd m11, m5, m7 |
|
unpckhpd m0, m0, m2 |
|
unpckhpd m1, m1, m3 |
|
unpckhpd m4, m4, m6 |
|
unpckhpd m5, m5, m7 |
|
|
|
vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 0], m8, 0 |
|
vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 16], m0, 0 |
|
vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 0], m8, 1 |
|
vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 16], m0, 1 |
|
|
|
vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 0], m9, 0 |
|
vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0 |
|
vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 0], m9, 1 |
|
vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1 |
|
|
|
vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 0], m10, 0 |
|
vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0 |
|
vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 0], m10, 1 |
|
vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1 |
|
|
|
vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 0], m11, 0 |
|
vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0 |
|
vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 0], m11, 1 |
|
vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1 |
|
%endmacro |
|
|
|
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3 |
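; Runs SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 twice to cover a full block.
; args: 1, 2: byte offsets of the other coefficient quarters
;       3:    optional extra byte offset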
|
%if %0 > 2 |
|
%define offset %3 |
|
%else |
|
%define offset 0 |
|
%endif |
|
SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset |
|
SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset |
|
%endmacro |
|
|
|
INIT_XMM sse3 |
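; Stand-alone 2-point transforms. The _asm variant has no prologue and returns
; with a plain ret, so other assembly can call it with the arguments already in
; place; fft2_float below is the normal C-callable entry point.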
|
cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride |
|
movaps m0, [inq] |
|
FFT2 m0, m1 |
|
movaps [outq], m0 |
|
ret |
|
|
|
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride |
|
movaps m0, [inq] |
|
FFT2 m0, m1 |
|
movaps [outq], m0 |
|
RET |
|
|
|
%macro FFT4_FN 3 |
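; 4-point transform entry points.
; args: 1: name suffix (fwd/inv)
;       2: swap the two input registers (inverse transform)
;       3: emit the bare _asm callable variant instead of the C entry point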
|
INIT_XMM sse2 |
|
%if %3 |
|
cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride |
|
%else |
|
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride |
|
%endif |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
|
|
%if %2 |
|
shufps m2, m1, m0, q3210 |
|
shufps m0, m0, m1, q3210 |
|
movaps m1, m2 |
|
%endif |
|
|
|
FFT4 m0, m1, m2 |
|
|
|
unpcklpd m2, m0, m1 |
|
unpckhpd m0, m0, m1 |
|
|
|
movaps [outq + 0*mmsize], m2 |
|
movaps [outq + 1*mmsize], m0 |
|
|
|
%if %3 |
|
ret |
|
%else |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
FFT4_FN fwd, 0, 0 |
|
FFT4_FN fwd, 0, 1 |
|
FFT4_FN inv, 1, 0 |
|
FFT4_FN inv, 1, 1 |
|
|
|
%macro FFT8_SSE_FN 1 |
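; 8-point SSE3 entry points.
; arg 1 = 1: emit fft8_asm_float (linear loads, plain ret, callable from other
;            assembly) plus the fft8_ns_float wrapper around it
; arg 1 = 0: emit fft8_float, which reorders its input through the context's
;            map LUT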
|
INIT_XMM sse3 |
|
%if %1 |
|
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
movaps m2, [inq + 2*mmsize] |
|
movaps m3, [inq + 3*mmsize] |
|
%else |
|
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp |
|
mov ctxq, [ctxq + AVTXContext.map] |
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq |
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq |
|
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq |
|
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq |
|
%endif |
|
|
|
FFT8 m0, m1, m2, m3, m4, m5 |
|
|
|
unpcklpd m4, m0, m3 |
|
unpcklpd m5, m1, m2 |
|
unpckhpd m0, m0, m3 |
|
unpckhpd m1, m1, m2 |
|
|
|
movups [outq + 0*mmsize], m4 |
|
movups [outq + 1*mmsize], m0 |
|
movups [outq + 2*mmsize], m5 |
|
movups [outq + 3*mmsize], m1 |
|
|
|
%if %1 |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %1 |
|
cglobal fft8_ns_float, 4, 5, 6, ctx, out, in, stride, tmp |
|
call mangle(ff_tx_fft8_asm_float_sse3) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
FFT8_SSE_FN 0 |
|
FFT8_SSE_FN 1 |
|
|
|
%macro FFT8_AVX_FN 1 |
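; AVX counterpart of FFT8_SSE_FN: the same entry points, with the 8 points held
; in two ymm registers.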
|
INIT_YMM avx |
|
%if %1 |
|
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
%else |
|
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp |
|
mov ctxq, [ctxq + AVTXContext.map] |
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2 |
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3 |
|
%endif |
|
|
|
FFT8_AVX m0, m1, m2, m3 |
|
|
|
unpcklpd m2, m0, m1 |
|
unpckhpd m0, m0, m1 |
|
|
|
|
|
vextractf128 [outq + 16*0], m2, 0 |
|
vextractf128 [outq + 16*1], m0, 0 |
|
vextractf128 [outq + 16*2], m2, 1 |
|
vextractf128 [outq + 16*3], m0, 1 |
|
|
|
%if %1 |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %1 |
|
cglobal fft8_ns_float, 4, 5, 4, ctx, out, in, stride, tmp |
|
call mangle(ff_tx_fft8_asm_float_avx) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
FFT8_AVX_FN 0 |
|
FFT8_AVX_FN 1 |
|
|
|
%macro FFT16_FN 2 |
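; 16-point entry points.
; args: 1: instruction set (avx/fma3)
;       2: emit the bare _asm variant plus its _ns wrapper instead of the
;          LUT-reordering C entry point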
|
INIT_YMM %1 |
|
%if %2 |
|
cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
movaps m2, [inq + 2*mmsize] |
|
movaps m3, [inq + 3*mmsize] |
|
%else |
|
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp |
|
mov ctxq, [ctxq + AVTXContext.map] |
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4 |
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5 |
|
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6 |
|
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7 |
|
%endif |
|
|
|
FFT16 m0, m1, m2, m3, m4, m5, m6, m7 |
|
|
|
unpcklpd m5, m1, m3 |
|
unpcklpd m4, m0, m2 |
|
unpckhpd m1, m1, m3 |
|
unpckhpd m0, m0, m2 |
|
|
|
vextractf128 [outq + 16*0], m4, 0 |
|
vextractf128 [outq + 16*1], m0, 0 |
|
vextractf128 [outq + 16*2], m4, 1 |
|
vextractf128 [outq + 16*3], m0, 1 |
|
vextractf128 [outq + 16*4], m5, 0 |
|
vextractf128 [outq + 16*5], m1, 0 |
|
vextractf128 [outq + 16*6], m5, 1 |
|
vextractf128 [outq + 16*7], m1, 1 |
|
|
|
%if %2 |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %2 |
|
cglobal fft16_ns_float, 4, 5, 8, ctx, out, in, stride, tmp |
|
call mangle(ff_tx_fft16_asm_float_ %+ %1) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
FFT16_FN avx, 0 |
|
FFT16_FN avx, 1 |
|
FFT16_FN fma3, 0 |
|
FFT16_FN fma3, 1 |
|
|
|
%macro FFT32_FN 2 |
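; 32-point entry points (x86-64 only, uses all 16 ymm registers): an 8-point
; and a 16-point sub-transform combined with one split-radix pass using the
; tab_32_float twiddles.
; args: 1: instruction set   2: as in FFT16_FN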
|
INIT_YMM %1 |
|
%if %2 |
|
cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp |
|
movaps m4, [inq + 4*mmsize] |
|
movaps m5, [inq + 5*mmsize] |
|
movaps m6, [inq + 6*mmsize] |
|
movaps m7, [inq + 7*mmsize] |
|
%else |
|
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp |
|
mov ctxq, [ctxq + AVTXContext.map] |
|
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12 |
|
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13 |
|
LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14 |
|
LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15 |
|
%endif |
|
|
|
FFT8 m4, m5, m6, m7, m8, m9 |
|
|
|
%if %2 |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
movaps m2, [inq + 2*mmsize] |
|
movaps m3, [inq + 3*mmsize] |
|
%else |
|
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m12 |
|
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9, m13 |
|
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14 |
|
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15 |
|
%endif |
|
|
|
movaps m8, [tab_32_float] |
|
vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23 |
|
|
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13 |
|
|
|
SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \ |
|
m10, m11, m12, m13, m14, m15 |
|
|
|
unpcklpd m9, m1, m3 |
|
unpcklpd m10, m5, m7 |
|
unpcklpd m8, m0, m2 |
|
unpcklpd m11, m4, m6 |
|
unpckhpd m1, m1, m3 |
|
unpckhpd m5, m5, m7 |
|
unpckhpd m0, m0, m2 |
|
unpckhpd m4, m4, m6 |
|
|
|
vextractf128 [outq + 16* 0], m8, 0 |
|
vextractf128 [outq + 16* 1], m0, 0 |
|
vextractf128 [outq + 16* 2], m8, 1 |
|
vextractf128 [outq + 16* 3], m0, 1 |
|
vextractf128 [outq + 16* 4], m9, 0 |
|
vextractf128 [outq + 16* 5], m1, 0 |
|
vextractf128 [outq + 16* 6], m9, 1 |
|
vextractf128 [outq + 16* 7], m1, 1 |
|
|
|
vextractf128 [outq + 16* 8], m11, 0 |
|
vextractf128 [outq + 16* 9], m4, 0 |
|
vextractf128 [outq + 16*10], m11, 1 |
|
vextractf128 [outq + 16*11], m4, 1 |
|
vextractf128 [outq + 16*12], m10, 0 |
|
vextractf128 [outq + 16*13], m5, 0 |
|
vextractf128 [outq + 16*14], m10, 1 |
|
vextractf128 [outq + 16*15], m5, 1 |
|
|
|
%if %2 |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %2 |
|
cglobal fft32_ns_float, 4, 5, 16, ctx, out, in, stride, tmp |
|
call mangle(ff_tx_fft32_asm_float_ %+ %1) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
%if ARCH_X86_64 |
|
FFT32_FN avx, 0 |
|
FFT32_FN avx, 1 |
|
FFT32_FN fma3, 0 |
|
FFT32_FN fma3, 1 |
|
%endif |
|
|
|
%macro FFT_SPLIT_RADIX_DEF 1-2 |
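; Emits the handler for one power-of-two length of the split-radix kernel:
; the two quarter-length sub-transforms via .32pt, then the combine loop for
; this length.
; args: 1: transform length
;       2: label of the next larger handler to jump to when the requested
;          length is bigger still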
|
ALIGN 16 |
|
.%1 %+ pt: |
|
PUSH lenq |
|
mov lenq, (%1/4) |
|
|
|
add outq, (%1*4) - (%1/1) |
|
call .32pt |
|
|
|
add outq, (%1*2) - (%1/2) |
|
call .32pt |
|
|
|
POP lenq |
|
sub outq, (%1*4) + (%1*2) + (%1/2) |
|
|
|
lea rtabq, [tab_ %+ %1 %+ _float] |
|
lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7] |
|
|
|
%if %0 > 1 |
|
cmp tgtq, %1 |
|
je .deinterleave |
|
|
|
mov tmpq, %1 |
|
|
|
.synth_ %+ %1: |
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0 |
|
add outq, 8*mmsize |
|
add rtabq, 4*mmsize |
|
sub itabq, 4*mmsize |
|
sub tmpq, 4*mmsize |
|
jg .synth_ %+ %1 |
|
|
|
cmp lenq, %1 |
|
jg %2 |
|
ret |
|
%endif |
|
%endmacro |
|
|
|
%macro FFT_SPLIT_RADIX_FN 2 |
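; Split-radix FFT for power-of-two lengths from 32 up to 131072.
; args: 1: instruction set (avx/fma3/avx2)
;       2: 1 emits the bare _asm variant (input already reordered, callable
;          from other assembly) plus the fft_sr_ns_float wrapper; 0 emits
;          fft_sr_float, which reorders its input through the map LUT
; The .32pt/.64pt/... labels form one recursive routine; .deinterleave
; produces the final, naturally ordered output for the requested length.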
|
INIT_YMM %1 |
|
%if %2 |
|
cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp |
|
%else |
|
cglobal fft_sr_float, 4, 10, 16, 272, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
%endif |
|
mov tgtq, lenq |
|
|
|
|
|
ALIGN 16 |
|
.32pt: |
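; 32-point base case: loads 32 points (through the LUT or linearly), runs an
; 8-point and a 16-point sub-transform plus one combine pass, and continues
; into .64pt when the remaining length is larger than 32.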
|
%if %2 |
|
movaps m4, [inq + 4*mmsize] |
|
movaps m5, [inq + 5*mmsize] |
|
movaps m6, [inq + 6*mmsize] |
|
movaps m7, [inq + 7*mmsize] |
|
%else |
|
LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m12 |
|
LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9, m13 |
|
LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14 |
|
LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15 |
|
%endif |
|
|
|
FFT8 m4, m5, m6, m7, m8, m9 |
|
|
|
%if %2 |
|
movaps m0, [inq + 0*mmsize] |
|
movaps m1, [inq + 1*mmsize] |
|
movaps m2, [inq + 2*mmsize] |
|
movaps m3, [inq + 3*mmsize] |
|
%else |
|
LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m12 |
|
LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9, m13 |
|
LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14 |
|
LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15 |
|
%endif |
|
|
|
movaps m8, [tab_32_float] |
|
vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23 |
|
|
|
FFT16 m0, m1, m2, m3, m10, m11, m12, m13 |
|
|
|
SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \ |
|
m10, m11, m12, m13, m14, m15 |
|
|
|
movaps [outq + 1*mmsize], m1 |
|
movaps [outq + 3*mmsize], m3 |
|
movaps [outq + 5*mmsize], m5 |
|
movaps [outq + 7*mmsize], m7 |
|
|
|
%if %2 |
|
add inq, 8*mmsize |
|
%else |
|
add lutq, (mmsize/2)*8 |
|
%endif |
|
cmp lenq, 32 |
|
jg .64pt |
|
|
|
movaps [outq + 0*mmsize], m0 |
|
movaps [outq + 2*mmsize], m2 |
|
movaps [outq + 4*mmsize], m4 |
|
movaps [outq + 6*mmsize], m6 |
|
|
|
ret |
|
|
|
|
|
ALIGN 16 |
|
.64pt: |
|
|
|
%define tx1_e0 m4 |
|
%define tx1_e1 m5 |
|
%define tx1_o0 m6 |
|
%define tx1_o1 m7 |
|
%define tx2_e0 m8 |
|
%define tx2_e1 m9 |
|
%define tx2_o0 m10 |
|
%define tx2_o1 m11 |
|
%define tw_e m12 |
|
%define tw_o m13 |
|
%define tmp1 m14 |
|
%define tmp2 m15 |
|
|
|
SWAP m4, m1 |
|
SWAP m6, m3 |
|
|
|
%if %2 |
|
movaps tx1_e0, [inq + 0*mmsize] |
|
movaps tx1_e1, [inq + 1*mmsize] |
|
movaps tx1_o0, [inq + 2*mmsize] |
|
movaps tx1_o1, [inq + 3*mmsize] |
|
%else |
|
LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1 |
|
LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2 |
|
LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1 |
|
LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2 |
|
%endif |
|
|
|
FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1 |
|
|
|
%if %2 |
|
movaps tx2_e0, [inq + 4*mmsize] |
|
movaps tx2_e1, [inq + 5*mmsize] |
|
movaps tx2_o0, [inq + 6*mmsize] |
|
movaps tx2_o1, [inq + 7*mmsize] |
|
%else |
|
LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1 |
|
LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2 |
|
LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1 |
|
LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2 |
|
%endif |
|
|
|
FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o |
|
|
|
movaps tw_e, [tab_64_float] |
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23 |
|
|
|
%if %2 |
|
add inq, 8*mmsize |
|
%else |
|
add lutq, (mmsize/2)*8 |
|
%endif |
|
cmp tgtq, 64 |
|
je .64pt_deint |
|
|
|
SPLIT_RADIX_COMBINE_64 |
|
|
|
cmp lenq, 64 |
|
jg .128pt |
|
ret |
|
|
|
|
|
ALIGN 16 |
|
.128pt: |
|
PUSH lenq |
|
mov lenq, 32 |
|
|
|
add outq, 16*mmsize |
|
call .32pt |
|
|
|
add outq, 8*mmsize |
|
call .32pt |
|
|
|
POP lenq |
|
sub outq, 24*mmsize |
|
|
|
lea rtabq, [tab_128_float] |
|
lea itabq, [tab_128_float + 128 - 4*7] |
|
|
|
cmp tgtq, 128 |
|
je .deinterleave |
|
|
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128 |
|
|
|
cmp lenq, 128 |
|
jg .256pt |
|
ret |
|
|
|
|
|
ALIGN 16 |
|
.256pt: |
|
PUSH lenq |
|
mov lenq, 64 |
|
|
|
add outq, 32*mmsize |
|
call .32pt |
|
|
|
add outq, 16*mmsize |
|
call .32pt |
|
|
|
POP lenq |
|
sub outq, 48*mmsize |
|
|
|
lea rtabq, [tab_256_float] |
|
lea itabq, [tab_256_float + 256 - 4*7] |
|
|
|
cmp tgtq, 256 |
|
je .deinterleave |
|
|
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256 |
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize |
|
|
|
cmp lenq, 256 |
|
jg .512pt |
|
ret |
|
|
|
|
|
ALIGN 16 |
|
.512pt: |
|
PUSH lenq |
|
mov lenq, 128 |
|
|
|
add outq, 64*mmsize |
|
call .32pt |
|
|
|
add outq, 32*mmsize |
|
call .32pt |
|
|
|
POP lenq |
|
sub outq, 96*mmsize |
|
|
|
lea rtabq, [tab_512_float] |
|
lea itabq, [tab_512_float + 512 - 4*7] |
|
|
|
cmp tgtq, 512 |
|
je .deinterleave |
|
|
|
mov tmpq, 4 |
|
|
|
.synth_512: |
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512 |
|
add outq, 8*mmsize |
|
add rtabq, 4*mmsize |
|
sub itabq, 4*mmsize |
|
sub tmpq, 1 |
|
jg .synth_512 |
|
|
|
cmp lenq, 512 |
|
jg .1024pt |
|
ret |
|
|
|
|
|
ALIGN 16 |
|
.1024pt: |
|
PUSH lenq |
|
mov lenq, 256 |
|
|
|
add outq, 96*mmsize |
|
call .32pt |
|
|
|
add outq, 64*mmsize |
|
call .32pt |
|
|
|
POP lenq |
|
sub outq, 192*mmsize |
|
|
|
lea rtabq, [tab_1024_float] |
|
lea itabq, [tab_1024_float + 1024 - 4*7] |
|
|
|
cmp tgtq, 1024 |
|
je .deinterleave |
|
|
|
mov tmpq, 8 |
|
|
|
.synth_1024: |
|
SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024 |
|
add outq, 8*mmsize |
|
add rtabq, 4*mmsize |
|
sub itabq, 4*mmsize |
|
sub tmpq, 1 |
|
jg .synth_1024 |
|
|
|
cmp lenq, 1024 |
|
jg .2048pt |
|
ret |
|
|
|
|
|
FFT_SPLIT_RADIX_DEF 2048, .4096pt |
|
FFT_SPLIT_RADIX_DEF 4096, .8192pt |
|
FFT_SPLIT_RADIX_DEF 8192, .16384pt |
|
FFT_SPLIT_RADIX_DEF 16384, .32768pt |
|
FFT_SPLIT_RADIX_DEF 32768, .65536pt |
|
FFT_SPLIT_RADIX_DEF 65536, .131072pt |
|
FFT_SPLIT_RADIX_DEF 131072 |
|
|
|
|
|
|
|
|
|
.deinterleave: |
|
%if %2 |
|
PUSH strideq |
|
%endif |
|
mov tgtq, lenq |
|
imul tmpq, lenq, 2 |
|
lea strideq, [4*lenq + tmpq] |
|
|
|
.synth_deinterleave: |
|
SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, strideq |
|
add outq, 8*mmsize |
|
add rtabq, 4*mmsize |
|
sub itabq, 4*mmsize |
|
sub tgtq, 4*mmsize |
|
jg .synth_deinterleave |
|
|
|
%if %2 |
|
POP strideq |
|
sub outq, tmpq |
|
neg tmpq |
|
lea inq, [inq + tmpq*4] |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
|
|
.64pt_deint: |
|
SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2 |
|
SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e |
|
|
|
unpcklpd tmp1, m0, m2 |
|
unpcklpd tmp2, m1, m3 |
|
unpcklpd tw_o, tx1_e0, tx1_o0 |
|
unpcklpd tw_e, tx2_e0, tx2_o0 |
|
unpckhpd m0, m0, m2 |
|
unpckhpd m1, m1, m3 |
|
unpckhpd tx1_e0, tx1_e0, tx1_o0 |
|
unpckhpd tx2_e0, tx2_e0, tx2_o0 |
|
|
|
vextractf128 [outq + 0*mmsize + 0], tmp1, 0 |
|
vextractf128 [outq + 0*mmsize + 16], m0, 0 |
|
vextractf128 [outq + 4*mmsize + 0], tmp2, 0 |
|
vextractf128 [outq + 4*mmsize + 16], m1, 0 |
|
|
|
vextractf128 [outq + 8*mmsize + 0], tw_o, 0 |
|
vextractf128 [outq + 8*mmsize + 16], tx1_e0, 0 |
|
vextractf128 [outq + 9*mmsize + 0], tw_o, 1 |
|
vextractf128 [outq + 9*mmsize + 16], tx1_e0, 1 |
|
|
|
vperm2f128 tmp1, tmp1, m0, 0x31 |
|
vperm2f128 tmp2, tmp2, m1, 0x31 |
|
|
|
vextractf128 [outq + 12*mmsize + 0], tw_e, 0 |
|
vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0 |
|
vextractf128 [outq + 13*mmsize + 0], tw_e, 1 |
|
vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1 |
|
|
|
movaps tw_e, [tab_64_float + mmsize] |
|
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23 |
|
|
|
movaps m0, [outq + 1*mmsize] |
|
movaps m1, [outq + 3*mmsize] |
|
movaps m2, [outq + 5*mmsize] |
|
movaps m3, [outq + 7*mmsize] |
|
|
|
movaps [outq + 1*mmsize], tmp1 |
|
movaps [outq + 5*mmsize], tmp2 |
|
|
|
SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \ |
|
tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 |
|
|
|
unpcklpd tmp1, m0, m1 |
|
unpcklpd tmp2, m2, m3 |
|
unpcklpd tw_e, tx1_e1, tx1_o1 |
|
unpcklpd tw_o, tx2_e1, tx2_o1 |
|
unpckhpd m0, m0, m1 |
|
unpckhpd m2, m2, m3 |
|
unpckhpd tx1_e1, tx1_e1, tx1_o1 |
|
unpckhpd tx2_e1, tx2_e1, tx2_o1 |
|
|
|
vextractf128 [outq + 2*mmsize + 0], tmp1, 0 |
|
vextractf128 [outq + 2*mmsize + 16], m0, 0 |
|
vextractf128 [outq + 3*mmsize + 0], tmp1, 1 |
|
vextractf128 [outq + 3*mmsize + 16], m0, 1 |
|
|
|
vextractf128 [outq + 6*mmsize + 0], tmp2, 0 |
|
vextractf128 [outq + 6*mmsize + 16], m2, 0 |
|
vextractf128 [outq + 7*mmsize + 0], tmp2, 1 |
|
vextractf128 [outq + 7*mmsize + 16], m2, 1 |
|
|
|
vextractf128 [outq + 10*mmsize + 0], tw_e, 0 |
|
vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0 |
|
vextractf128 [outq + 11*mmsize + 0], tw_e, 1 |
|
vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1 |
|
|
|
vextractf128 [outq + 14*mmsize + 0], tw_o, 0 |
|
vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0 |
|
vextractf128 [outq + 15*mmsize + 0], tw_o, 1 |
|
vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1 |
|
|
|
%if %2 |
|
sub inq, 16*mmsize |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %2 |
|
cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt, off |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
|
|
call mangle(ff_tx_fft_sr_asm_float_ %+ %1) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
%if ARCH_X86_64 |
|
FFT_SPLIT_RADIX_FN avx, 0 |
|
FFT_SPLIT_RADIX_FN avx, 1 |
|
FFT_SPLIT_RADIX_FN fma3, 0 |
|
FFT_SPLIT_RADIX_FN fma3, 1 |
|
%if HAVE_AVX2_EXTERNAL |
|
FFT_SPLIT_RADIX_FN avx2, 0 |
|
FFT_SPLIT_RADIX_FN avx2, 1 |
|
%endif |
|
%endif |
|
|
|
%macro FFT15_FN 2 |
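; Stand-alone 15-point transform (AVX2, x86-64 only).
; args: 1: 1 loads the input linearly, 0 loads it through the map LUT
;       2: function name suffix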
|
INIT_YMM avx2 |
|
cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5 |
|
mov lutq, [ctxq + AVTXContext.map] |
|
|
|
imul stride3q, strideq, 3 |
|
imul stride5q, strideq, 5 |
|
|
|
movaps m11, [mask_mmppmmmm] |
|
movaps m10, [tab_53_float] |
|
movaps xm9, [tab_53_float + 32] |
|
vpermpd m9, m9, q1110 |
|
movaps m8, [s15_perm] |
|
|
|
%if %1 |
|
movups xm0, [inq] |
|
movddup xm5, [inq + 16] |
|
movups m2, [inq + mmsize*0 + 24] |
|
movups m3, [inq + mmsize*1 + 24] |
|
movups m4, [inq + mmsize*2 + 24] |
|
%else |
|
LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 |
|
LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7 |
|
LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 |
|
LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7 |
|
mov tmpd, [lutq + 8] |
|
movddup xm5, [inq + tmpq*8] |
|
%endif |
|
|
|
FFT15 |
|
|
|
lea tgt5q, [outq + stride5q] |
|
lea tmpq, [outq + stride5q*2] |
|
|
|
movhps [outq], xm14 |
|
movhps [outq + stride5q*1], xm15 |
|
movlps [outq + stride5q*2], xm15 |
|
|
|
vextractf128 xm3, m0, 1 |
|
vextractf128 xm4, m1, 1 |
|
vextractf128 xm5, m2, 1 |
|
|
|
movlps [outq + strideq*1], xm1 |
|
movhps [outq + strideq*2], xm2 |
|
movlps [outq + stride3q*1], xm3 |
|
movhps [outq + strideq*4], xm4 |
|
movlps [outq + stride3q*2], xm0 |
|
movlps [outq + strideq*8], xm5 |
|
movhps [outq + stride3q*4], xm0 |
|
movhps [tgt5q + strideq*2], xm1 |
|
movhps [tgt5q + strideq*4], xm3 |
|
movlps [tmpq + strideq*1], xm2 |
|
movlps [tmpq + stride3q*1], xm4 |
|
movhps [tmpq + strideq*4], xm5 |
|
|
|
RET |
|
%endmacro |
|
|
|
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL |
|
FFT15_FN 0, float |
|
FFT15_FN 1, ns_float |
|
%endif |
|
|
|
%macro IMDCT_FN 1 |
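; Inverse MDCT: pre-rotates the input with the exponent table (a gather-based
; path for arbitrary strides and a faster path for stride == 4), runs the
; subcontext FFT through its stored function pointer, then post-rotates and
; stores the output.
; arg 1: instruction set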
|
INIT_YMM %1 |
|
cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \ |
|
t4, t5, btmp |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov expq, [ctxq + AVTXContext.exp] |
|
|
|
lea t1d, [lend - 1] |
|
imul t1d, strided |
|
|
|
mov btmpq, ctxq |
|
mov lutq, [ctxq + AVTXContext.map] |
|
|
|
cmp strideq, 4 |
|
je .stride4 |
|
|
|
shl strideq, 1 |
|
movd xm4, strided |
|
vpbroadcastd m4, xm4 |
|
movd xm5, t1d |
|
vpbroadcastd m5, xm5 |
|
|
|
mov t2q, outq |
|
pcmpeqd m15, m15 |
|
|
|
.stridex_pre: |
|
pmulld m2, m4, [lutq] |
|
movaps m0, m15 |
|
psubd m3, m5, m2 |
|
movaps m1, m15 |
|
vgatherdps m6, [inq + m2], m0 |
|
vgatherdps m7, [inq + m3], m1 |
|
|
|
movaps m8, [expq + 0*mmsize] |
|
movaps m9, [expq + 1*mmsize] |
|
|
|
unpcklps m0, m7, m6 |
|
unpckhps m1, m7, m6 |
|
|
|
vperm2f128 m2, m1, m0, 0x02 |
|
vperm2f128 m3, m1, m0, 0x13 |
|
|
|
movshdup m10, m8 |
|
movshdup m11, m9 |
|
movsldup m12, m8 |
|
movsldup m13, m9 |
|
|
|
mulps m10, m2 |
|
mulps m11, m3 |
|
|
|
shufps m10, m10, m10, q2301 |
|
shufps m11, m11, m11, q2301 |
|
|
|
fmaddsubps m10, m12, m2, m10 |
|
fmaddsubps m11, m13, m3, m11 |
|
|
|
movups [t2q + 0*mmsize], m10 |
|
movups [t2q + 1*mmsize], m11 |
|
|
|
add expq, mmsize*2 |
|
add lutq, mmsize |
|
add t2q, mmsize*2 |
|
sub lenq, mmsize/2 |
|
jg .stridex_pre |
|
jmp .transform |
|
|
|
.stride4: |
|
lea expq, [expq + lenq*4] |
|
lea lutq, [lutq + lenq*2] |
|
lea t1q, [inq + t1q] |
|
lea t1q, [t1q + strideq - mmsize] |
|
lea t2q, [lenq*2 - mmsize/2] |
|
|
|
.stride4_pre: |
|
movups m4, [inq] |
|
movups m3, [t1q] |
|
|
|
movsldup m1, m4 |
|
movshdup m0, m3 |
|
movshdup m4, m4 |
|
movsldup m3, m3 |
|
|
|
movups m2, [expq] |
|
movups m5, [expq + 2*t2q] |
|
|
|
vpermpd m0, m0, q0123 |
|
shufps m7, m2, m2, q2301 |
|
vpermpd m4, m4, q0123 |
|
shufps m8, m5, m5, q2301 |
|
|
|
mulps m1, m7 |
|
mulps m3, m8 |
|
|
|
fmaddsubps m0, m0, m2, m1 |
|
fmaddsubps m4, m4, m5, m3 |
|
|
|
vextractf128 xm3, m0, 1 |
|
vextractf128 xm6, m4, 1 |
|
|
|
|
|
movsxd strideq, dword [lutq + 0*4] |
|
movsxd lenq, dword [lutq + 1*4] |
|
movsxd t3q, dword [lutq + 2*4] |
|
movsxd t4q, dword [lutq + 3*4] |
|
|
|
movlps [outq + strideq*8], xm0 |
|
movhps [outq + lenq*8], xm0 |
|
movlps [outq + t3q*8], xm3 |
|
movhps [outq + t4q*8], xm3 |
|
|
|
movsxd strideq, dword [lutq + 0*4 + t2q] |
|
movsxd lenq, dword [lutq + 1*4 + t2q] |
|
movsxd t3q, dword [lutq + 2*4 + t2q] |
|
movsxd t4q, dword [lutq + 3*4 + t2q] |
|
|
|
movlps [outq + strideq*8], xm4 |
|
movhps [outq + lenq*8], xm4 |
|
movlps [outq + t3q*8], xm6 |
|
movhps [outq + t4q*8], xm6 |
|
|
|
add lutq, mmsize/2 |
|
add expq, mmsize |
|
add inq, mmsize |
|
sub t1q, mmsize |
|
sub t2q, mmsize |
|
jge .stride4_pre |
|
|
|
.transform: |
|
mov strideq, 2*4 |
|
mov t4q, ctxq |
|
mov t5q, [ctxq + AVTXContext.fn] |
|
mov ctxq, [ctxq + AVTXContext.sub] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
|
|
mov inq, outq |
|
call t5q |
|
|
|
mov ctxq, t4q |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov expq, [ctxq + AVTXContext.exp] |
|
lea expq, [expq + lenq*4] |
|
|
|
xor t1q, t1q |
|
lea t2q, [lenq*4 - mmsize] |
|
|
|
.post: |
|
movaps m2, [expq + t2q] |
|
movaps m3, [expq + t1q] |
|
movups m0, [outq + t2q] |
|
movups m1, [outq + t1q] |
|
|
|
movshdup m4, m2 |
|
movshdup m5, m3 |
|
movsldup m6, m2 |
|
movsldup m7, m3 |
|
|
|
shufps m2, m0, m0, q2301 |
|
shufps m3, m1, m1, q2301 |
|
|
|
mulps m6, m0 |
|
mulps m7, m1 |
|
|
|
fmaddsubps m4, m4, m2, m6 |
|
fmaddsubps m5, m5, m3, m7 |
|
|
|
vpermpd m3, m5, q0123 |
|
vpermpd m2, m4, q0123 |
|
|
|
blendps m1, m2, m5, 01010101b |
|
blendps m0, m3, m4, 01010101b |
|
|
|
movups [outq + t2q], m0 |
|
movups [outq + t1q], m1 |
|
|
|
add t1q, mmsize |
|
sub t2q, mmsize |
|
sub lenq, mmsize/2 |
|
jg .post |
|
|
|
RET |
|
%endmacro |
|
|
|
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL |
|
IMDCT_FN avx2 |
|
%endif |
|
|
|
%macro PFA_15_FN 2 |
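; 15xM prime-factor FFT: .dim1 runs 15-point transforms into the context's
; temporary buffer, .dim2 runs the subcontext FFT over the other dimension,
; and .post permutes the result into the output through the map LUT.
; args: 1: instruction set
;       2: emit the bare _asm variant instead of the LUT-reordering C entry
;          point (an fft_pfa_15xM_ns_float wrapper is also emitted)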
|
INIT_YMM %1 |
|
%if %2 |
|
cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ |
|
tgt5, stride3, stride5, btmp |
|
%else |
|
cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ |
|
tgt5, stride3, stride5, btmp |
|
%endif |
|
|
|
%if %2 |
|
PUSH inq |
|
PUSH tgt5q |
|
PUSH stride3q |
|
PUSH stride5q |
|
PUSH btmpq |
|
%endif |
|
|
|
PUSH strideq |
|
|
|
mov btmpq, outq |
|
|
|
mov outq, [ctxq + AVTXContext.tmp] |
|
%if %2 == 0 |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
%endif |
|
|
|
|
|
mov tmpq, [ctxq + AVTXContext.sub] |
|
movsxd strideq, dword [tmpq + AVTXContext.len] |
|
mov mapq, [tmpq + AVTXContext.map] |
|
|
|
shl strideq, 3 |
|
imul stride3q, strideq, 3 |
|
imul stride5q, strideq, 5 |
|
|
|
movaps m11, [mask_mmppmmmm] |
|
movaps m10, [tab_53_float] |
|
movaps xm9, [tab_53_float + 32] |
|
vpermpd m9, m9, q1110 |
|
movaps m8, [s15_perm] |
|
|
|
.dim1: |
|
mov tmpd, [mapq] |
|
lea tgtq, [outq + tmpq*8] |
|
|
|
%if %2 |
|
movups xm0, [inq] |
|
movddup xm5, [inq + 16] |
|
movups m2, [inq + mmsize*0 + 24] |
|
movups m3, [inq + mmsize*1 + 24] |
|
movups m4, [inq + mmsize*2 + 24] |
|
%else |
|
LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 |
|
LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7 |
|
LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15 |
|
LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7 |
|
mov tmpd, [lutq + 8] |
|
movddup xm5, [inq + tmpq*8] |
|
%endif |
|
|
|
FFT15 |
|
|
|
lea tgt5q, [tgtq + stride5q] |
|
lea tmpq, [tgtq + stride5q*2] |
|
|
|
movhps [tgtq], xm14 |
|
movhps [tgtq + stride5q*1], xm15 |
|
movlps [tgtq + stride5q*2], xm15 |
|
|
|
vextractf128 xm3, m0, 1 |
|
vextractf128 xm4, m1, 1 |
|
vextractf128 xm5, m2, 1 |
|
|
|
movlps [tgtq + strideq*1], xm1 |
|
movhps [tgtq + strideq*2], xm2 |
|
movlps [tgtq + stride3q*1], xm3 |
|
movhps [tgtq + strideq*4], xm4 |
|
movlps [tgtq + stride3q*2], xm0 |
|
movlps [tgtq + strideq*8], xm5 |
|
movhps [tgtq + stride3q*4], xm0 |
|
movhps [tgt5q + strideq*2], xm1 |
|
movhps [tgt5q + strideq*4], xm3 |
|
movlps [tmpq + strideq*1], xm2 |
|
movlps [tmpq + stride3q*1], xm4 |
|
movhps [tmpq + strideq*4], xm5 |
|
|
|
%if %2 |
|
add inq, mmsize*3 + 24 |
|
%else |
|
add lutq, (mmsize/2)*3 + 12 |
|
%endif |
|
add mapq, 4 |
|
sub lenq, 15 |
|
jg .dim1 |
|
|
|
|
|
mov stride5q, ctxq |
|
movsxd stride3q, dword [ctxq + AVTXContext.len] |
|
mov tgt5q, [ctxq + AVTXContext.fn] |
|
|
|
mov inq, outq |
|
mov ctxq, [ctxq + AVTXContext.sub] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
|
|
.dim2: |
|
call tgt5q |
|
lea inq, [inq + lenq*8] |
|
lea outq, [outq + lenq*8] |
|
sub stride3q, lenq |
|
jg .dim2 |
|
|
|
mov ctxq, stride5q |
|
mov lutq, [ctxq + AVTXContext.map] |
|
mov inq, [ctxq + AVTXContext.tmp] |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
|
|
lea stride3q, [lutq + lenq*4] |
|
mov stride5q, lenq |
|
mov tgt5q, btmpq |
|
POP strideq |
|
lea tmpq, [strideq + 2*strideq] |
|
|
|
.post: |
|
LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9 |
|
vextractf128 xm1, m0, 1 |
|
movlps [tgt5q], xm0 |
|
movhps [tgt5q + strideq], xm0 |
|
movlps [tgt5q + strideq*2], xm1 |
|
movhps [tgt5q + tmpq], xm1 |
|
|
|
lea tgt5q, [tgt5q + 4*strideq] |
|
add stride3q, mmsize/2 |
|
sub stride5q, mmsize/8 |
|
jg .post |
|
|
|
%if %2 |
|
mov outq, btmpq |
|
POP btmpq |
|
POP stride5q |
|
POP stride3q |
|
POP tgt5q |
|
POP inq |
|
ret |
|
%else |
|
RET |
|
%endif |
|
|
|
%if %2 |
|
cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \ |
|
tgt5, stride3, stride5, btmp |
|
movsxd lenq, dword [ctxq + AVTXContext.len] |
|
mov lutq, [ctxq + AVTXContext.map] |
|
|
|
call mangle(ff_tx_fft_pfa_15xM_asm_float) |
|
RET |
|
%endif |
|
%endmacro |
|
|
|
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL |
|
PFA_15_FN avx2, 0 |
|
PFA_15_FN avx2, 1 |
|
%endif |
|
|