|
from peachpy import * |
|
from peachpy.x86_64 import * |
|
|
|
|
|
def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit instructions converting packed 16-bit halves to 32-bit floats.

    Only the four low 16-bit lanes of ``xmm_half`` are converted (both
    unpack steps use VPUNPCKLWD, which reads the low four words). Each
    input lane is interpreted as 1 sign bit, 5 exponent bits and 10
    mantissa bits.

    Two candidate results are computed per lane and blended:

    * exponent field != 0: the exponent/mantissa bits are moved into
      float32 position and the exponent is rebiased by 127 - 15 = 112;
    * exponent field == 0 (zero / subnormal): the value ``m * 2**-24`` is
      reconstructed with a magic-number float subtraction.

    There is no special path for an all-ones exponent field: such inputs
    go through the same rebias as every other normal number, so they do
    not become float32 Inf/NaN.
    NOTE(review): the ``alt`` in the name presumably refers to the ARM
    "alternative" half-precision format, which has no Inf/NaN -- confirm
    against callers.

    Args:
        xmm_half: XMM register operand holding the 16-bit inputs. It is
            only used as a source operand and is not written.

    Returns:
        A newly allocated ``XMMRegister`` holding four 32-bit floats.
    """
    # Zero register, used to widen each 16-bit half into the HIGH word of
    # a 32-bit lane.
    xmm_zero = XMMRegister()
    VPXOR(xmm_zero, xmm_zero, xmm_zero)

    # Per 32-bit lane: word = half << 16 (sign bit lands in bit 31).
    xmm_word = XMMRegister()
    VPUNPCKLWD(xmm_word, xmm_zero, xmm_half)

    # Per 16-bit lane: half << 1, which shifts the sign bit out.
    xmm_shl1_half = XMMRegister()
    VPADDW(xmm_shl1_half, xmm_half, xmm_half)

    # Per 32-bit lane: word << 1, which likewise discards the sign bit.
    xmm_shl1_nonsign = XMMRegister()
    VPADDD(xmm_shl1_nonsign, xmm_word, xmm_word)

    # -0.0 has only bit 31 set, so it works as a sign-bit mask.
    sign_mask = Constant.float32x4(-0.0)

    # Isolate the sign bit, already in its float32 position.
    xmm_sign = XMMRegister()
    VANDPS(xmm_sign, xmm_word, sign_mask)

    # (word << 1) >> 4 == non-sign bits of the half << 13: the 5-bit
    # exponent ends at bit 23 and the 10-bit mantissa occupies bits 22..13,
    # i.e. the float32 field layout with a zeroed sign.
    xmm_shr3_nonsign = XMMRegister()
    VPSRLD(xmm_shr3_nonsign, xmm_shl1_nonsign, 4)

    # 0x38000000 == 112 << 23: the exponent bias difference (127 - 15).
    exp_offset = Constant.uint32x4(0x38000000)

    # Candidate result for inputs with a non-zero exponent field.
    xmm_norm_nonsign = XMMRegister()
    VPADDD(xmm_norm_nonsign, xmm_shr3_nonsign, exp_offset)

    # Candidate result for inputs with a zero exponent field.
    # Build the float32 bit pattern 0x3E800000 | (half << 1). Since
    # 0x3E800000 is 0.25f, for a half whose exponent field is zero this
    # pattern equals 0.25 + m * 2**-24, where m is the 10-bit mantissa.
    magic_mask = Constant.uint16x8(0x3E80)
    xmm_denorm_nonsign = XMMRegister()
    VPUNPCKLWD(xmm_denorm_nonsign, xmm_shl1_half, magic_mask)

    # Subtracting the 0.25 bias leaves m * 2**-24, the subnormal's value.
    magic_bias = Constant.float32x4(0.25)
    VSUBPS(xmm_denorm_nonsign, xmm_denorm_nonsign, magic_bias)

    # 0x00800000 == 1 << 23: any lane whose shifted non-sign bits are
    # below this has a zero exponent field.
    xmm_denorm_cutoff = XMMRegister()
    VMOVDQA(xmm_denorm_cutoff, Constant.uint32x4(0x00800000))

    # All-ones in lanes where cutoff > shifted bits (zero exponent field).
    # The signed compare is safe: the shifted value has its top bits clear.
    xmm_denorm_mask = XMMRegister()
    VPCMPGTD(xmm_denorm_mask, xmm_denorm_cutoff, xmm_shr3_nonsign)

    # Take the subnormal candidate where the mask is set, otherwise the
    # normal candidate.
    xmm_nonsign = XMMRegister()
    VBLENDVPS(xmm_nonsign, xmm_norm_nonsign, xmm_denorm_nonsign, xmm_denorm_mask)

    # Re-attach the sign bit.
    xmm_float = XMMRegister()
    VORPS(xmm_float, xmm_nonsign, xmm_sign)

    return xmm_float
|
|