File size: 6,845 Bytes
8b7c501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#pragma once
#ifndef FP16_PSIMD_H
#define FP16_PSIMD_H

#if defined(__cplusplus) && (__cplusplus >= 201103L)
	#include <cstdint>
#elif !defined(__OPENCL_VERSION__)
	#include <stdint.h>
#endif

#include <psimd.h>


PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) {
	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);

	const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000));
	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);

	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
#else
	const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
#endif
	const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale);

	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
	const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias);

	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff;
	return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign));
}

PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) {
	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);

	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
	const psimd_u32 sign_lo = word_lo & sign_mask;
	const psimd_u32 sign_hi = word_hi & sign_mask;
	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);

	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000));
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f);
#else
	const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000)));
#endif
	const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale);
	const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale);

	const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80));
	const psimd_u16 shl1_half = half + half;
	const psimd_f32 magic_bias = psimd_splat_f32(0.25f);
	const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias);
	const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias);

	const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff;
	const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff;

	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo));
	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi));
	return result;
}

PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) {
	const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);

	const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000));
	const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4);

#if 0
	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
	const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset;
	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset);
	const psimd_s32 exp113_offset = exp112_offset | exp1_offset;
	return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset)));
#else
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
	const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset);
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
#else
	const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000)));
#endif
	return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias)));
#endif
}

PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) {
	const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half);
	const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half);

	const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000));
	const psimd_u32 sign_lo = word_lo & sign_mask;
	const psimd_u32 sign_hi = word_hi & sign_mask;
	const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4);
	const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4);

#if 1
	const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000));
	const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset;
	const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset;
	const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000));
	const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset);
	const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset);
	const psimd_s32 exp113_offset = exp1_offset | exp112_offset;
	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset)));
	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset)));
	return result;
#else
	const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000));
	const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset);
	const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset);
	const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f);
	psimd_f32x2 result;
	result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias)));
	result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias)));
	return result;
#endif
}

#endif /* FP16_PSIMD_H */