File size: 6,845 Bytes
8b7c501 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
#pragma once
#ifndef FP16_PSIMD_H
#define FP16_PSIMD_H
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
#include <stdint.h>
#endif
#include <psimd.h>
/*
 * Convert IEEE half-precision bit patterns to single-precision values.
 *
 * Only the low half of `half` is consumed (it is read through
 * psimd_interleave_lo_u16); fp16_ieee_to_fp32x2_psimd converts both halves.
 *
 * Two candidate results are computed for every lane and one is selected:
 *  - a "normalized" result, valid when the half exponent field is non-zero;
 *  - a "denormalized" result, valid when the half exponent field is zero.
 * The sign bit is extracted up front and OR-ed back into the selected result.
 */
PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) {
	/*
	 * Pair each consumed 16-bit value with a zero lane to build 32-bit words.
	 * NOTE(review): the math below assumes the half bits land in the upper
	 * 16 bits of each word (little-endian lane layout) - confirm for new targets.
	 */
	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);

	const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000));
	/* word + word shifts the sign bit out; >> 4 then leaves the magnitude 3 bits below its position in word. */
	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);

	/*
	 * Normalized path: 0x70000000 raises the exponent field by 224, and the
	 * multiplication by 2**-112 brings the net adjustment to +112. An all-ones
	 * half exponent (31) becomes 255 before the multiplication.
	 */
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
#else
	/*
	 * 0x07800000 is the binary32 bit pattern of 2**-112 (exponent field 15).
	 * It is splatted as an integer and reinterpreted, so this path needs no
	 * scalar bit-cast helper from outside this header.
	 */
	const psimd_f32 exp_scale = (psimd_f32) psimd_splat_u32(UINT32_C(0x07800000));
#endif
	const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale);

	/*
	 * Denormalized path: pairing (half + half) with 0x3E80 builds a word whose
	 * upper 16 bits match those of 0.25f (0x3E800000); subtracting 0.25f then
	 * leaves only the contribution of the low bits.
	 */
	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
	const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias);

	/* Lanes below 0x00800000 have a zero exponent field and take the denormalized result. */
	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff;

	/* Both OR operands are psimd_u32, so no implicit conversion between vector types is needed. */
	return (psimd_f32) (sign | (psimd_u32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign));
}
/*
 * Convert a full vector of IEEE half-precision bit patterns to single precision.
 *
 * The low half of `half` (read through psimd_interleave_lo_u16) is returned in
 * result.lo and the high half (psimd_interleave_hi_u16) in result.hi.
 *
 * For every lane a "normalized" and a "denormalized" candidate are computed;
 * the candidate matching the lane's exponent field is selected and the sign
 * bit is OR-ed back in.
 */
PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) {
	/*
	 * Pair each 16-bit value with a zero lane to build 32-bit words.
	 * NOTE(review): the math below assumes the half bits land in the upper
	 * 16 bits of each word (little-endian lane layout) - confirm for new targets.
	 */
	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);

	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
	const psimd_u32 sign_lo = word_lo & sign_mask;
	const psimd_u32 sign_hi = word_hi & sign_mask;
	/* word + word shifts the sign bit out; >> 4 then leaves the magnitude 3 bits below its position in word. */
	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);

	/*
	 * Normalized path: 0x70000000 raises the exponent field by 224, and the
	 * multiplication by 2**-112 brings the net adjustment to +112. An all-ones
	 * half exponent (31) becomes 255 before the multiplication.
	 */
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
#else
	/*
	 * 0x07800000 is the binary32 bit pattern of 2**-112 (exponent field 15).
	 * It is splatted as an integer and reinterpreted, so this path needs no
	 * scalar bit-cast helper from outside this header.
	 */
	const psimd_f32 exp_scale = (psimd_f32) psimd_splat_u32(UINT32_C(0x07800000));
#endif
	const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale);
	const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale);

	/*
	 * Denormalized path: pairing (half + half) with 0x3E80 builds words whose
	 * upper 16 bits match those of 0.25f (0x3E800000); subtracting 0.25f then
	 * leaves only the contribution of the low bits.
	 */
	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
	const psimd_u16 shl1_half = half + half;
	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
	const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias);
	const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias);

	/* Lanes below 0x00800000 have a zero exponent field and take the denormalized result. */
	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff;
	const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff;

	/* Both OR operands are psimd_u32, so no implicit conversion between vector types is needed. */
	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_u32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo));
	result.hi = (psimd_f32) (sign_hi | (psimd_u32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi));
	return result;
}
/*
 * Convert "alternative" half-precision bit patterns to single precision.
 *
 * Only the low half of `half` is consumed (it is read through
 * psimd_interleave_lo_u16); fp16_alt_to_fp32x2_psimd converts both halves.
 *
 * Unlike the IEEE variant, no lane is special-cased for an all-ones exponent
 * field: such inputs go through the same arithmetic as any other non-zero
 * exponent.
 * NOTE(review): "alt" presumably refers to the ARM alternative half-precision
 * format (no Inf/NaN encodings) - confirm against the library documentation.
 *
 * With x = magnitude re-biased by +112 in the exponent field, the result is
 * 2*x - max(x, 2**-14):
 *  - exponent field non-zero: x >= 2**-14, so the result is x;
 *  - exponent field zero:     x <  2**-14, so the result is 2*x - 2**-14,
 *    which removes the implicit leading one.
 */
PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) {
	/*
	 * Pair each consumed 16-bit value with a zero lane to build 32-bit words.
	 * NOTE(review): the math below assumes the half bits land in the upper
	 * 16 bits of each word (little-endian lane layout) - confirm for new targets.
	 */
	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);

	/* Unsigned constant macro, matching the unsigned splat and the sibling conversions. */
	const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000));
	/* word + word shifts the sign bit out; >> 4 then leaves the magnitude 3 bits below its position in word. */
	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);

#if 0
	/*
	 * Disabled alternative: doubles by adding 1 to the exponent field and takes
	 * the maximum on the integer bit patterns (0x38800000 encodes 2**-14).
	 */
	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
	const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset;
	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset);
	const psimd_s32 exp113_offset = exp112_offset | exp1_offset;
	return (psimd_f32) (sign | (psimd_u32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset)));
#else
	/* 0x38000000 adds 112 to the exponent field. */
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
	const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset);
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
#else
	/*
	 * 0x38800000 is the binary32 bit pattern of 2**-14 (exponent field 113).
	 * It is splatted as an integer and reinterpreted, so this path needs no
	 * scalar bit-cast helper from outside this header.
	 */
	const psimd_f32 denorm_bias = (psimd_f32) psimd_splat_u32(UINT32_C(0x38800000));
#endif
	/* Both OR operands are psimd_u32, so no implicit conversion between vector types is needed. */
	return (psimd_f32) (sign | (psimd_u32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias)));
#endif
}
/*
 * Convert a full vector of "alternative" half-precision bit patterns to
 * single precision.
 *
 * The low half of `half` (read through psimd_interleave_lo_u16) is returned in
 * result.lo and the high half (psimd_interleave_hi_u16) in result.hi.
 *
 * No lane is special-cased for an all-ones exponent field.
 * NOTE(review): "alt" presumably refers to the ARM alternative half-precision
 * format (no Inf/NaN encodings) - confirm against the library documentation.
 *
 * With x = magnitude re-biased by +112 in the exponent field, the result is
 * 2*x - max(x, 2**-14):
 *  - exponent field non-zero: x >= 2**-14, so the result is x;
 *  - exponent field zero:     x <  2**-14, so the result is 2*x - 2**-14,
 *    which removes the implicit leading one.
 */
PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) {
	/*
	 * Pair each 16-bit value with a zero lane to build 32-bit words.
	 * NOTE(review): the math below assumes the half bits land in the upper
	 * 16 bits of each word (little-endian lane layout) - confirm for new targets.
	 */
	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);

	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
	const psimd_u32 sign_lo = word_lo & sign_mask;
	const psimd_u32 sign_hi = word_hi & sign_mask;
	/* word + word shifts the sign bit out; >> 4 then leaves the magnitude 3 bits below its position in word. */
	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);

#if 1
	/*
	 * Active path: 2*x is formed by adding 1 to the exponent field, and
	 * max(x, 2**-14) is taken on the integer bit patterns (0x38800000 encodes
	 * 2**-14; the bit patterns involved are non-negative as signed integers).
	 */
	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
	const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset;
	const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset;
	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset);
	const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset);
	const psimd_s32 exp113_offset = exp1_offset | exp112_offset;

	/* Both OR operands are psimd_u32, so no implicit conversion between vector types is needed. */
	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_u32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset)));
	result.hi = (psimd_f32) (sign_hi | (psimd_u32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset)));
	return result;
#else
	/* Disabled alternative: the same formula evaluated with floating-point add/max. */
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
	const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset);
	const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset);
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
#else
	/* 0x38800000 is the binary32 bit pattern of 2**-14 (exponent field 113). */
	const psimd_f32 denorm_bias = (psimd_f32) psimd_splat_u32(UINT32_C(0x38800000));
#endif

	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_u32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias)));
	result.hi = (psimd_f32) (sign_hi | (psimd_u32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias)));
	return result;
#endif
}
#endif /* FP16_PSIMD_H */
|