File size: 1,427 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
from peachpy import *
from peachpy.x86_64 import *
def fp16_alt_xmm_to_fp32_ymm(xmm_half):
    """Emit the instruction sequence widening packed fp16 values to fp32.

    The sequence is built purely from integer/bitwise AVX2 operations plus
    one floating-point subtraction: normal inputs are converted by
    re-biasing the exponent, inputs with a zero exponent field (zero and
    subnormals) by a "magic bias" subtraction, and the sign bit is OR-ed
    back in at the end. There is no separate path for the maximum exponent
    value, so it is converted like any other normal number.
    NOTE(review): presumably this is what "alt" in the name refers to (a
    half-precision variant without Inf/NaN encodings) -- confirm.

    :param xmm_half: register holding the packed 16-bit inputs; must expose
        an ``as_ymm`` view (presumably an XMM register with eight halves).
    :return: a newly allocated YMM register holding the float32 results.
    """
    # Qword selector 0b01_01_00_00 -> [q0, q0, q1, q1]: input qword 0 lands
    # in the low 128-bit lane and qword 1 in the high lane, so the in-lane
    # "unpack low" operations below see four halves per lane.
    halves = YMMRegister()
    VPERMQ(halves, xmm_half.as_ymm, 0b01010000)

    # A VEX-encoded 128-bit XOR also clears the upper half of the register.
    zeros = YMMRegister()
    VPXOR(zeros.as_xmm, zeros.as_xmm, zeros.as_xmm)

    # Interleave (0, half) per 32-bit element, i.e. element = half << 16.
    shifted16 = YMMRegister()
    VPUNPCKLWD(shifted16, zeros, halves)

    # x + x == x << 1: drops the sign bit out of each 16-bit word.
    halves_x2 = YMMRegister()
    VPADDW(halves_x2, halves, halves)

    # Same trick on the 32-bit elements: the sign bit falls off the top.
    nonsign_x2 = YMMRegister()
    VPADDD(nonsign_x2, shifted16, shifted16)

    # -0.0f has only the top bit set (0x80000000): isolates the sign.
    sign_bit = Constant.float32x8(-0.0)

    sign = YMMRegister()
    VANDPS(sign, shifted16, sign_bit)

    # (<< 1 then >> 4) leaves the non-sign bits shifted right by 3 relative
    # to `shifted16`, lining the half exponent/mantissa up with the float32
    # exponent/mantissa fields.
    magnitude = YMMRegister()
    VPSRLD(magnitude, nonsign_x2, 4)

    # Exponent re-bias: (127 - 15) << 23 == 0x38000000.
    rebias = Constant.uint32x8(0x38000000)

    normal = YMMRegister()
    VPADDD(normal, magnitude, rebias)

    # Subnormal path: build a float whose high word is 0x3E80 (the top half
    # of 0.25f) and whose low word is half << 1, then subtract 0.25f; what
    # remains is the value encoded by the subnormal mantissa.
    bias_hi = Constant.uint16x16(0x3E80)
    subnormal = YMMRegister()
    VPUNPCKLWD(subnormal, halves_x2, bias_hi)

    bias = Constant.float32x8(0.25)
    VSUBPS(subnormal, subnormal, bias)

    # Elements below 0x00800000 have an all-zero exponent field.
    cutoff = YMMRegister()
    min_normal = Constant.uint32x8(0x00800000)
    VMOVDQA(cutoff, min_normal)

    # Signed compare is fine here: `magnitude` has its top bits cleared.
    is_subnormal = YMMRegister()
    VPCMPGTD(is_subnormal, cutoff, magnitude)

    # Per element: take the subnormal result where the mask is set,
    # otherwise the re-biased normal result.
    abs_value = YMMRegister()
    VBLENDVPS(abs_value, normal, subnormal, is_subnormal)

    # Re-attach the sign bit.
    result = YMMRegister()
    VORPS(result, abs_value, sign)

    return result
|