from peachpy import *
from peachpy.x86_64 import *


def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit AVX code converting packed 16-bit half floats to packed fp32.

    Only the four low 16-bit lanes of ``xmm_half`` are converted (the
    unpack instructions used are the "low" variants).  The largest exponent
    value gets no special treatment: there is no Inf/NaN path, it is re-biased
    like any other normalized exponent.
    NOTE(review): the "alt" in the name presumably refers to the ARM
    alternative half-precision format -- confirm against callers.

    Instructions are appended to the currently active PeachPy function in the
    order written below; ``xmm_half`` is only read, never written.

    :param xmm_half: XMM register holding the half-precision inputs in its
        low four 16-bit lanes.
    :return: a freshly allocated XMM register with four fp32 results.
    """
    # All-zero register, used as the low half-word of every 32-bit lane below.
    xmm_zeros = XMMRegister()
    VPXOR(xmm_zeros, xmm_zeros, xmm_zeros)

    # Each 32-bit lane becomes (half << 16): the half's sign bit lands on the
    # fp32 sign bit.
    xmm_widened = XMMRegister()
    VPUNPCKLWD(xmm_widened, xmm_zeros, xmm_half)

    # 16-bit lanes: half + half == half << 1, which shifts the sign bit out.
    xmm_half_x2 = XMMRegister()
    VPADDW(xmm_half_x2, xmm_half, xmm_half)

    # 32-bit lanes: same trick on the widened value to discard the sign bit.
    xmm_widened_x2 = XMMRegister()
    VPADDD(xmm_widened_x2, xmm_widened, xmm_widened)

    # -0.0f has only bit 31 set, so the AND isolates the sign.
    sign_bit = Constant.float32x4(-0.0)

    xmm_sign_only = XMMRegister()
    VANDPS(xmm_sign_only, xmm_widened, sign_bit)

    # (<< 1 then >> 4) is a net >> 3 of the sign-less value: the 5-bit half
    # exponent ends up in bits 27..23 and the mantissa right below it.
    xmm_magnitude = XMMRegister()
    VPSRLD(xmm_magnitude, xmm_widened_x2, 4)

    # 0x38000000 == 112 << 23, i.e. the difference between the fp32 exponent
    # bias (127) and the half exponent bias (15).
    exponent_rebias = Constant.uint32x4(0x38000000)

    xmm_normal_result = XMMRegister()
    VPADDD(xmm_normal_result, xmm_magnitude, exponent_rebias)

    # Denormal path: build the float 0x3E80_xxxx (0.25f with the doubled half
    # bits in the low mantissa) ...
    magic_high_word = Constant.uint16x8(0x3E80)
    xmm_denormal_result = XMMRegister()
    VPUNPCKLWD(xmm_denormal_result, xmm_half_x2, magic_high_word)

    # ... and subtract 0.25f so that only the contribution of the low bits
    # remains, now as a properly normalized fp32 value.
    quarter = Constant.float32x4(0.25)
    VSUBPS(xmm_denormal_result, xmm_denormal_result, quarter)

    # Magnitudes below 0x00800000 have an all-zero exponent field.
    xmm_cutoff = XMMRegister()
    cutoff_value = Constant.uint32x4(0x00800000)
    VMOVDQA(xmm_cutoff, cutoff_value)

    # Lane mask: all ones where cutoff > magnitude (signed 32-bit compare).
    xmm_is_denormal = XMMRegister()
    VPCMPGTD(xmm_is_denormal, xmm_cutoff, xmm_magnitude)

    # Take the denormal result where the mask is set, the normal one elsewhere.
    xmm_unsigned_result = XMMRegister()
    VBLENDVPS(xmm_unsigned_result, xmm_normal_result, xmm_denormal_result, xmm_is_denormal)

    # Re-attach the sign bit.
    xmm_result = XMMRegister()
    VORPS(xmm_result, xmm_unsigned_result, xmm_sign_only)

    return xmm_result