Line data Source code
1 : // Branch-free implementation of half-precision (16 bit) floating point
2 : // Copyright 2006 Mike Acton <macton@gmail.com>
3 : //
4 : // Permission is hereby granted, free of charge, to any person obtaining a
5 : // copy of this software and associated documentation files (the "Software"),
6 : // to deal in the Software without restriction, including without limitation
7 : // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 : // and/or sell copies of the Software, and to permit persons to whom the
9 : // Software is furnished to do so, subject to the following conditions:
10 : //
11 : // The above copyright notice and this permission notice shall be included
12 : // in all copies or substantial portions of the Software.
13 : //
14 : // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 : // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 : // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 : // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 : // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 : // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 : // THE SOFTWARE
21 : //
22 : // Half-precision floating point format
23 : // ------------------------------------
24 : //
25 : // | Field | Last | First | Note
26 : // |----------|------|-------|----------
27 : // | Sign | 15 | 15 |
28 : // | Exponent | 14 | 10 | Bias = 15
29 : // | Mantissa | 9 | 0 |
30 : //
31 : // Compiling
32 : // ---------
33 : //
34 : // Preferred compile flags for GCC:
35 : // -O3 -fstrict-aliasing -std=c99 -pedantic -Wall -Wstrict-aliasing
36 : //
37 : // This file is a C99 source file, intended to be compiled with a C99
38 : // compliant compiler. However, for the moment it remains combatible
39 : // with C++98. Therefore if you are using a compiler that poorly implements
40 : // C standards (e.g. MSVC), it may be compiled as C++. This is not
41 : // guaranteed for future versions.
42 : //
43 :
44 : #include "half.h"
45 :
46 : // Load immediate
47 27648000 : static inline uint32_t _uint32_li(uint32_t a)
48 : {
49 27648000 : return (a);
50 : }
51 :
52 : // Decrement
53 2304000 : static inline uint32_t _uint32_dec(uint32_t a)
54 : {
55 2304000 : return (a - 1);
56 : }
57 :
58 : // Increment
59 0 : static inline uint32_t _uint32_inc(uint32_t a)
60 : {
61 0 : return (a + 1);
62 : }
63 :
64 : // Complement
65 0 : static inline uint32_t _uint32_not(uint32_t a)
66 : {
67 0 : return (~a);
68 : }
69 :
70 : // Negate
71 4608000 : static inline uint32_t _uint32_neg(uint32_t a)
72 : {
73 : #pragma warning(disable : 4146)
74 4608000 : return (-a);
75 : #pragma warning(default : 4146)
76 : }
77 :
78 : // Extend sign
79 11520000 : static inline uint32_t _uint32_ext(uint32_t a)
80 : {
81 11520000 : return (((int32_t)a) >> 31);
82 : }
83 :
84 : // And
85 23040000 : static inline uint32_t _uint32_and(uint32_t a, uint32_t b)
86 : {
87 23040000 : return (a & b);
88 : }
89 :
90 : // Exclusive Or
91 0 : static inline uint32_t _uint32_xor(uint32_t a, uint32_t b)
92 : {
93 0 : return (a ^ b);
94 : }
95 :
96 : // And with Complement
97 16128000 : static inline uint32_t _uint32_andc(uint32_t a, uint32_t b)
98 : {
99 16128000 : return (a & ~b);
100 : }
101 :
102 : // Or
103 18432000 : static inline uint32_t _uint32_or(uint32_t a, uint32_t b)
104 : {
105 18432000 : return (a | b);
106 : }
107 :
108 : // Shift Right Logical
109 0 : static inline uint32_t _uint32_srl(uint32_t a, int sa)
110 : {
111 0 : return (a >> sa);
112 : }
113 :
114 : // Shift Left Logical
115 11520000 : static inline uint32_t _uint32_sll(uint32_t a, int sa)
116 : {
117 11520000 : return (a << sa);
118 : }
119 :
120 : // Add
121 2304000 : static inline uint32_t _uint32_add(uint32_t a, uint32_t b)
122 : {
123 2304000 : return (a + b);
124 : }
125 :
126 : // Subtract
127 6912000 : static inline uint32_t _uint32_sub(uint32_t a, uint32_t b)
128 : {
129 6912000 : return (a - b);
130 : }
131 :
132 : // Multiply
133 0 : static inline uint32_t _uint32_mul(uint32_t a, uint32_t b)
134 : {
135 0 : return (a * b);
136 : }
137 :
138 : // Select on Sign bit
139 9216000 : static inline uint32_t _uint32_sels(uint32_t test, uint32_t a, uint32_t b)
140 : {
141 9216000 : const uint32_t mask = _uint32_ext(test);
142 9216000 : const uint32_t sel_a = _uint32_and(a, mask);
143 9216000 : const uint32_t sel_b = _uint32_andc(b, mask);
144 9216000 : const uint32_t result = _uint32_or(sel_a, sel_b);
145 :
146 9216000 : return (result);
147 : }
148 :
149 : // Select Bits on mask
150 0 : static inline uint32_t _uint32_selb(uint32_t mask, uint32_t a, uint32_t b)
151 : {
152 0 : const uint32_t sel_a = _uint32_and(a, mask);
153 0 : const uint32_t sel_b = _uint32_andc(b, mask);
154 0 : const uint32_t result = _uint32_or(sel_a, sel_b);
155 :
156 0 : return (result);
157 : }
158 :
159 : // Load Immediate
160 0 : static inline uint16_t _uint16_li(uint16_t a)
161 : {
162 0 : return (a);
163 : }
164 :
165 : // Extend sign
166 0 : static inline uint16_t _uint16_ext(uint16_t a)
167 : {
168 0 : return (((int16_t)a) >> 15);
169 : }
170 :
171 : // Negate
172 0 : static inline uint16_t _uint16_neg(uint16_t a)
173 : {
174 0 : return (-a);
175 : }
176 :
177 : #ifndef __GNUC__
178 : // Complement
179 : static inline uint16_t _uint16_not(uint16_t a)
180 : {
181 : return (~a);
182 : }
183 : // Add then Mask
184 : static inline uint16_t _uint16_addm(uint16_t a, uint16_t b, uint16_t mask)
185 : {
186 : return ((a + b) & mask);
187 : }
188 : #endif
189 :
190 : // Decrement
191 0 : static inline uint16_t _uint16_dec(uint16_t a)
192 : {
193 0 : return (a - 1);
194 : }
195 :
196 : // Shift Left Logical
197 0 : static inline uint16_t _uint16_sll(uint16_t a, int sa)
198 : {
199 0 : return (a << sa);
200 : }
201 :
202 : // Shift Right Logical
203 0 : static inline uint16_t _uint16_srl(uint16_t a, int sa)
204 : {
205 0 : return (a >> sa);
206 : }
207 :
208 : // Add
209 0 : static inline uint16_t _uint16_add(uint16_t a, uint16_t b)
210 : {
211 0 : return (a + b);
212 : }
213 :
214 : // Subtract
215 0 : static inline uint16_t _uint16_sub(uint16_t a, uint16_t b)
216 : {
217 0 : return (a - b);
218 : }
219 :
220 : // And
221 0 : static inline uint16_t _uint16_and(uint16_t a, uint16_t b)
222 : {
223 0 : return (a & b);
224 : }
225 :
226 : // Or
227 0 : static inline uint16_t _uint16_or(uint16_t a, uint16_t b)
228 : {
229 0 : return (a | b);
230 : }
231 :
232 : // Exclusive Or
233 0 : static inline uint16_t _uint16_xor(uint16_t a, uint16_t b)
234 : {
235 0 : return (a ^ b);
236 : }
237 :
238 : // And with Complement
239 0 : static inline uint16_t _uint16_andc(uint16_t a, uint16_t b)
240 : {
241 0 : return (a & ~b);
242 : }
243 :
244 : // And then Shift Right Logical
245 0 : static inline uint16_t _uint16_andsrl(uint16_t a, uint16_t b, int sa)
246 : {
247 0 : return ((a & b) >> sa);
248 : }
249 :
250 : // Shift Right Logical then Mask
251 0 : static inline uint16_t _uint16_srlm(uint16_t a, int sa, uint16_t mask)
252 : {
253 0 : return ((a >> sa) & mask);
254 : }
255 :
256 : // Select on Sign bit
257 0 : static inline uint16_t _uint16_sels(uint16_t test, uint16_t a, uint16_t b)
258 : {
259 0 : const uint16_t mask = _uint16_ext(test);
260 0 : const uint16_t sel_a = _uint16_and(a, mask);
261 0 : const uint16_t sel_b = _uint16_andc(b, mask);
262 0 : const uint16_t result = _uint16_or(sel_a, sel_b);
263 :
264 0 : return (result);
265 : }
266 :
267 : // Count Leading Zeros
268 2304000 : static inline uint32_t _uint32_cntlz(uint32_t x)
269 : {
270 : #ifdef __GNUC__
271 : /* NOTE: __builtin_clz is undefined for x == 0 */
272 : /* On PowerPC, this will map to insn: cntlzw */
273 : /* On Pentium, this will map to insn: clz */
274 2304000 : uint32_t is_x_nez_msb = _uint32_neg(x);
275 2304000 : uint32_t nlz = __builtin_clz(x);
276 2304000 : uint32_t result = _uint32_sels(is_x_nez_msb, nlz, 0x00000020);
277 2304000 : return (result);
278 : #else
279 : const uint32_t x0 = _uint32_srl(x, 1);
280 : const uint32_t x1 = _uint32_or(x, x0);
281 : const uint32_t x2 = _uint32_srl(x1, 2);
282 : const uint32_t x3 = _uint32_or(x1, x2);
283 : const uint32_t x4 = _uint32_srl(x3, 4);
284 : const uint32_t x5 = _uint32_or(x3, x4);
285 : const uint32_t x6 = _uint32_srl(x5, 8);
286 : const uint32_t x7 = _uint32_or(x5, x6);
287 : const uint32_t x8 = _uint32_srl(x7, 16);
288 : const uint32_t x9 = _uint32_or(x7, x8);
289 : const uint32_t xA = _uint32_not(x9);
290 : const uint32_t xB = _uint32_srl(xA, 1);
291 : const uint32_t xC = _uint32_and(xB, 0x55555555);
292 : const uint32_t xD = _uint32_sub(xA, xC);
293 : const uint32_t xE = _uint32_and(xD, 0x33333333);
294 : const uint32_t xF = _uint32_srl(xD, 2);
295 : const uint32_t x10 = _uint32_and(xF, 0x33333333);
296 : const uint32_t x11 = _uint32_add(xE, x10);
297 : const uint32_t x12 = _uint32_srl(x11, 4);
298 : const uint32_t x13 = _uint32_add(x11, x12);
299 : const uint32_t x14 = _uint32_and(x13, 0x0f0f0f0f);
300 : const uint32_t x15 = _uint32_srl(x14, 8);
301 : const uint32_t x16 = _uint32_add(x14, x15);
302 : const uint32_t x17 = _uint32_srl(x16, 16);
303 : const uint32_t x18 = _uint32_add(x16, x17);
304 : const uint32_t x19 = _uint32_and(x18, 0x0000003f);
305 : return (x19);
306 : #endif
307 : }
308 :
309 : // Count Leading Zeros
310 0 : static inline uint16_t _uint16_cntlz(uint16_t x)
311 : {
312 : #ifdef __GNUC__
313 0 : uint16_t nlz32 = (uint16_t)_uint32_cntlz((uint32_t)x);
314 0 : uint32_t nlz = _uint32_sub(nlz32, 16);
315 0 : return (nlz);
316 : #else
317 : const uint16_t x0 = _uint16_srl(x, 1);
318 : const uint16_t x1 = _uint16_or(x, x0);
319 : const uint16_t x2 = _uint16_srl(x1, 2);
320 : const uint16_t x3 = _uint16_or(x1, x2);
321 : const uint16_t x4 = _uint16_srl(x3, 4);
322 : const uint16_t x5 = _uint16_or(x3, x4);
323 : const uint16_t x6 = _uint16_srl(x5, 8);
324 : const uint16_t x7 = _uint16_or(x5, x6);
325 : const uint16_t x8 = _uint16_not(x7);
326 : const uint16_t x9 = _uint16_srlm(x8, 1, 0x5555);
327 : const uint16_t xA = _uint16_sub(x8, x9);
328 : const uint16_t xB = _uint16_and(xA, 0x3333);
329 : const uint16_t xC = _uint16_srlm(xA, 2, 0x3333);
330 : const uint16_t xD = _uint16_add(xB, xC);
331 : const uint16_t xE = _uint16_srl(xD, 4);
332 : const uint16_t xF = _uint16_addm(xD, xE, 0x0f0f);
333 : const uint16_t x10 = _uint16_srl(xF, 8);
334 : const uint16_t x11 = _uint16_addm(xF, x10, 0x001f);
335 : return (x11);
336 : #endif
337 : }
338 :
339 0 : uint16_t half_from_float(float f_)
340 : {
341 0 : const uint32_t f = *reinterpret_cast<uint32_t*>(&f_);
342 0 : const uint32_t one = _uint32_li(0x00000001);
343 0 : const uint32_t f_s_mask = _uint32_li(0x80000000);
344 0 : const uint32_t f_e_mask = _uint32_li(0x7f800000);
345 0 : const uint32_t f_m_mask = _uint32_li(0x007fffff);
346 0 : const uint32_t f_m_hidden_bit = _uint32_li(0x00800000);
347 0 : const uint32_t f_m_round_bit = _uint32_li(0x00001000);
348 0 : const uint32_t f_snan_mask = _uint32_li(0x7fc00000);
349 0 : const uint32_t f_e_pos = _uint32_li(0x00000017);
350 0 : const uint32_t h_e_pos = _uint32_li(0x0000000a);
351 0 : const uint32_t h_e_mask = _uint32_li(0x00007c00);
352 0 : const uint32_t h_snan_mask = _uint32_li(0x00007e00);
353 0 : const uint32_t h_e_mask_value = _uint32_li(0x0000001f);
354 0 : const uint32_t f_h_s_pos_offset = _uint32_li(0x00000010);
355 0 : const uint32_t f_h_bias_offset = _uint32_li(0x00000070);
356 0 : const uint32_t f_h_m_pos_offset = _uint32_li(0x0000000d);
357 0 : const uint32_t h_nan_min = _uint32_li(0x00007c01);
358 0 : const uint32_t f_h_e_biased_flag = _uint32_li(0x0000008f);
359 0 : const uint32_t f_s = _uint32_and(f, f_s_mask);
360 0 : const uint32_t f_e = _uint32_and(f, f_e_mask);
361 0 : const uint16_t h_s = _uint32_srl(f_s, f_h_s_pos_offset);
362 0 : const uint32_t f_m = _uint32_and(f, f_m_mask);
363 0 : const uint16_t f_e_amount = _uint32_srl(f_e, f_e_pos);
364 0 : const uint32_t f_e_half_bias = _uint32_sub(f_e_amount, f_h_bias_offset);
365 0 : const uint32_t f_snan = _uint32_and(f, f_snan_mask);
366 0 : const uint32_t f_m_round_mask = _uint32_and(f_m, f_m_round_bit);
367 0 : const uint32_t f_m_round_offset = _uint32_sll(f_m_round_mask, one);
368 0 : const uint32_t f_m_rounded = _uint32_add(f_m, f_m_round_offset);
369 0 : const uint32_t f_m_denorm_sa = _uint32_sub(one, f_e_half_bias);
370 0 : const uint32_t f_m_with_hidden = _uint32_or(f_m_rounded, f_m_hidden_bit);
371 0 : const uint32_t f_m_denorm = _uint32_srl(f_m_with_hidden, f_m_denorm_sa);
372 0 : const uint32_t h_m_denorm = _uint32_srl(f_m_denorm, f_h_m_pos_offset);
373 : const uint32_t f_m_rounded_overflow =
374 0 : _uint32_and(f_m_rounded, f_m_hidden_bit);
375 0 : const uint32_t m_nan = _uint32_srl(f_m, f_h_m_pos_offset);
376 0 : const uint32_t h_em_nan = _uint32_or(h_e_mask, m_nan);
377 0 : const uint32_t h_e_norm_overflow_offset = _uint32_inc(f_e_half_bias);
378 : const uint32_t h_e_norm_overflow =
379 0 : _uint32_sll(h_e_norm_overflow_offset, h_e_pos);
380 0 : const uint32_t h_e_norm = _uint32_sll(f_e_half_bias, h_e_pos);
381 0 : const uint32_t h_m_norm = _uint32_srl(f_m_rounded, f_h_m_pos_offset);
382 0 : const uint32_t h_em_norm = _uint32_or(h_e_norm, h_m_norm);
383 0 : const uint32_t is_h_ndenorm_msb = _uint32_sub(f_h_bias_offset, f_e_amount);
384 : const uint32_t is_f_e_flagged_msb =
385 0 : _uint32_sub(f_h_e_biased_flag, f_e_half_bias);
386 0 : const uint32_t is_h_denorm_msb = _uint32_not(is_h_ndenorm_msb);
387 0 : const uint32_t is_f_m_eqz_msb = _uint32_dec(f_m);
388 0 : const uint32_t is_h_nan_eqz_msb = _uint32_dec(m_nan);
389 : const uint32_t is_f_inf_msb =
390 0 : _uint32_and(is_f_e_flagged_msb, is_f_m_eqz_msb);
391 : const uint32_t is_f_nan_underflow_msb =
392 0 : _uint32_and(is_f_e_flagged_msb, is_h_nan_eqz_msb);
393 : const uint32_t is_e_overflow_msb =
394 0 : _uint32_sub(h_e_mask_value, f_e_half_bias);
395 0 : const uint32_t is_h_inf_msb = _uint32_or(is_e_overflow_msb, is_f_inf_msb);
396 0 : const uint32_t is_f_nsnan_msb = _uint32_sub(f_snan, f_snan_mask);
397 0 : const uint32_t is_m_norm_overflow_msb = _uint32_neg(f_m_rounded_overflow);
398 0 : const uint32_t is_f_snan_msb = _uint32_not(is_f_nsnan_msb);
399 : const uint32_t h_em_overflow_result =
400 0 : _uint32_sels(is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm);
401 : const uint32_t h_em_nan_result =
402 0 : _uint32_sels(is_f_e_flagged_msb, h_em_nan, h_em_overflow_result);
403 : const uint32_t h_em_nan_underflow_result =
404 0 : _uint32_sels(is_f_nan_underflow_msb, h_nan_min, h_em_nan_result);
405 : const uint32_t h_em_inf_result =
406 0 : _uint32_sels(is_h_inf_msb, h_e_mask, h_em_nan_underflow_result);
407 : const uint32_t h_em_denorm_result =
408 0 : _uint32_sels(is_h_denorm_msb, h_m_denorm, h_em_inf_result);
409 : const uint32_t h_em_snan_result =
410 0 : _uint32_sels(is_f_snan_msb, h_snan_mask, h_em_denorm_result);
411 0 : const uint32_t h_result = _uint32_or(h_s, h_em_snan_result);
412 :
413 0 : return (uint16_t)(h_result);
414 : }
415 :
416 2304000 : float half_to_float(uint16_t h)
417 : {
418 2304000 : const uint32_t h_e_mask = _uint32_li(0x00007c00);
419 2304000 : const uint32_t h_m_mask = _uint32_li(0x000003ff);
420 2304000 : const uint32_t h_s_mask = _uint32_li(0x00008000);
421 2304000 : const uint32_t h_f_s_pos_offset = _uint32_li(0x00000010);
422 2304000 : const uint32_t h_f_e_pos_offset = _uint32_li(0x0000000d);
423 2304000 : const uint32_t h_f_bias_offset = _uint32_li(0x0001c000);
424 2304000 : const uint32_t f_e_mask = _uint32_li(0x7f800000);
425 2304000 : const uint32_t f_m_mask = _uint32_li(0x007fffff);
426 2304000 : const uint32_t h_f_e_denorm_bias = _uint32_li(0x0000007e);
427 2304000 : const uint32_t h_f_m_denorm_sa_bias = _uint32_li(0x00000008);
428 2304000 : const uint32_t f_e_pos = _uint32_li(0x00000017);
429 2304000 : const uint32_t h_e_mask_minus_one = _uint32_li(0x00007bff);
430 2304000 : const uint32_t h_e = _uint32_and(h, h_e_mask);
431 2304000 : const uint32_t h_m = _uint32_and(h, h_m_mask);
432 2304000 : const uint32_t h_s = _uint32_and(h, h_s_mask);
433 2304000 : const uint32_t h_e_f_bias = _uint32_add(h_e, h_f_bias_offset);
434 2304000 : const uint32_t h_m_nlz = _uint32_cntlz(h_m);
435 2304000 : const uint32_t f_s = _uint32_sll(h_s, h_f_s_pos_offset);
436 2304000 : const uint32_t f_e = _uint32_sll(h_e_f_bias, h_f_e_pos_offset);
437 2304000 : const uint32_t f_m = _uint32_sll(h_m, h_f_e_pos_offset);
438 2304000 : const uint32_t f_em = _uint32_or(f_e, f_m);
439 2304000 : const uint32_t h_f_m_sa = _uint32_sub(h_m_nlz, h_f_m_denorm_sa_bias);
440 : const uint32_t f_e_denorm_unpacked =
441 2304000 : _uint32_sub(h_f_e_denorm_bias, h_f_m_sa);
442 2304000 : const uint32_t h_f_m = _uint32_sll(h_m, h_f_m_sa);
443 2304000 : const uint32_t f_m_denorm = _uint32_and(h_f_m, f_m_mask);
444 2304000 : const uint32_t f_e_denorm = _uint32_sll(f_e_denorm_unpacked, f_e_pos);
445 2304000 : const uint32_t f_em_denorm = _uint32_or(f_e_denorm, f_m_denorm);
446 2304000 : const uint32_t f_em_nan = _uint32_or(f_e_mask, f_m);
447 2304000 : const uint32_t is_e_eqz_msb = _uint32_dec(h_e);
448 2304000 : const uint32_t is_m_nez_msb = _uint32_neg(h_m);
449 2304000 : const uint32_t is_e_flagged_msb = _uint32_sub(h_e_mask_minus_one, h_e);
450 2304000 : const uint32_t is_zero_msb = _uint32_andc(is_e_eqz_msb, is_m_nez_msb);
451 2304000 : const uint32_t is_inf_msb = _uint32_andc(is_e_flagged_msb, is_m_nez_msb);
452 2304000 : const uint32_t is_denorm_msb = _uint32_and(is_m_nez_msb, is_e_eqz_msb);
453 2304000 : const uint32_t is_nan_msb = _uint32_and(is_e_flagged_msb, is_m_nez_msb);
454 2304000 : const uint32_t is_zero = _uint32_ext(is_zero_msb);
455 2304000 : const uint32_t f_zero_result = _uint32_andc(f_em, is_zero);
456 : const uint32_t f_denorm_result =
457 2304000 : _uint32_sels(is_denorm_msb, f_em_denorm, f_zero_result);
458 : const uint32_t f_inf_result =
459 2304000 : _uint32_sels(is_inf_msb, f_e_mask, f_denorm_result);
460 : const uint32_t f_nan_result =
461 2304000 : _uint32_sels(is_nan_msb, f_em_nan, f_inf_result);
462 2304000 : const uint32_t f_result = _uint32_or(f_s, f_nan_result);
463 :
464 2304000 : return *reinterpret_cast<const float*>(&f_result);
465 : }
466 :
467 : // half_add
468 : // --------
469 : //
470 : // (SUM) uint16_t z = half_add( x, y );
471 : // (DIFFERENCE) uint16_t z = half_add( x, -y );
472 : //
473 : // * Difference of ZEROs is always +ZERO
474 : // * Sum round with guard + round + sticky bit (grs)
475 : // * QNaN + <x> = QNaN
476 : // * <x> + +INF = +INF
477 : // * <x> - -INF = -INF
478 : // * INF - INF = SNaN
479 : //
480 : // Will have exactly (0 ulps difference) the same result as:
481 : // (Round up)
482 : //
483 : // union FLOAT_32
484 : // {
485 : // float f32;
486 : // uint32_t u32;
487 : // };
488 : //
489 : // union FLOAT_32 fx = { .u32 = half_to_float( x ) };
490 : // union FLOAT_32 fy = { .u32 = half_to_float( y ) };
491 : // union FLOAT_32 fz = { .f32 = fx.f32 + fy.f32 };
492 : // uint16_t z = float_to_half( fz );
493 : //
494 0 : uint16_t half_add(uint16_t x, uint16_t y)
495 : {
496 0 : const uint16_t one = _uint16_li(0x0001);
497 0 : const uint16_t msb_to_lsb_sa = _uint16_li(0x000f);
498 0 : const uint16_t h_s_mask = _uint16_li(0x8000);
499 0 : const uint16_t h_e_mask = _uint16_li(0x7c00);
500 0 : const uint16_t h_m_mask = _uint16_li(0x03ff);
501 0 : const uint16_t h_m_msb_mask = _uint16_li(0x2000);
502 0 : const uint16_t h_m_msb_sa = _uint16_li(0x000d);
503 0 : const uint16_t h_m_hidden = _uint16_li(0x0400);
504 0 : const uint16_t h_e_pos = _uint16_li(0x000a);
505 0 : const uint16_t h_e_bias_minus_one = _uint16_li(0x000e);
506 0 : const uint16_t h_m_grs_carry = _uint16_li(0x4000);
507 0 : const uint16_t h_m_grs_carry_pos = _uint16_li(0x000e);
508 0 : const uint16_t h_grs_size = _uint16_li(0x0003);
509 0 : const uint16_t h_snan = _uint16_li(0xfe00);
510 0 : const uint16_t h_e_mask_minus_one = _uint16_li(0x7bff);
511 0 : const uint16_t h_grs_round_carry = _uint16_sll(one, h_grs_size);
512 0 : const uint16_t h_grs_round_mask = _uint16_sub(h_grs_round_carry, one);
513 0 : const uint16_t x_e = _uint16_and(x, h_e_mask);
514 0 : const uint16_t y_e = _uint16_and(y, h_e_mask);
515 0 : const uint16_t is_y_e_larger_msb = _uint16_sub(x_e, y_e);
516 0 : const uint16_t a = _uint16_sels(is_y_e_larger_msb, y, x);
517 0 : const uint16_t a_s = _uint16_and(a, h_s_mask);
518 0 : const uint16_t a_e = _uint16_and(a, h_e_mask);
519 0 : const uint16_t a_m_no_hidden_bit = _uint16_and(a, h_m_mask);
520 0 : const uint16_t a_em_no_hidden_bit = _uint16_or(a_e, a_m_no_hidden_bit);
521 0 : const uint16_t b = _uint16_sels(is_y_e_larger_msb, x, y);
522 0 : const uint16_t b_s = _uint16_and(b, h_s_mask);
523 0 : const uint16_t b_e = _uint16_and(b, h_e_mask);
524 0 : const uint16_t b_m_no_hidden_bit = _uint16_and(b, h_m_mask);
525 0 : const uint16_t b_em_no_hidden_bit = _uint16_or(b_e, b_m_no_hidden_bit);
526 0 : const uint16_t is_diff_sign_msb = _uint16_xor(a_s, b_s);
527 : const uint16_t is_a_inf_msb =
528 0 : _uint16_sub(h_e_mask_minus_one, a_em_no_hidden_bit);
529 : const uint16_t is_b_inf_msb =
530 0 : _uint16_sub(h_e_mask_minus_one, b_em_no_hidden_bit);
531 0 : const uint16_t is_undenorm_msb = _uint16_dec(a_e);
532 0 : const uint16_t is_undenorm = _uint16_ext(is_undenorm_msb);
533 0 : const uint16_t is_both_inf_msb = _uint16_and(is_a_inf_msb, is_b_inf_msb);
534 0 : const uint16_t is_invalid_inf_op_msb = _uint16_and(is_both_inf_msb, b_s);
535 0 : const uint16_t is_a_e_nez_msb = _uint16_neg(a_e);
536 0 : const uint16_t is_b_e_nez_msb = _uint16_neg(b_e);
537 0 : const uint16_t is_a_e_nez = _uint16_ext(is_a_e_nez_msb);
538 0 : const uint16_t is_b_e_nez = _uint16_ext(is_b_e_nez_msb);
539 0 : const uint16_t a_m_hidden_bit = _uint16_and(is_a_e_nez, h_m_hidden);
540 0 : const uint16_t b_m_hidden_bit = _uint16_and(is_b_e_nez, h_m_hidden);
541 0 : const uint16_t a_m_no_grs = _uint16_or(a_m_no_hidden_bit, a_m_hidden_bit);
542 0 : const uint16_t b_m_no_grs = _uint16_or(b_m_no_hidden_bit, b_m_hidden_bit);
543 0 : const uint16_t diff_e = _uint16_sub(a_e, b_e);
544 0 : const uint16_t a_e_unbias = _uint16_sub(a_e, h_e_bias_minus_one);
545 0 : const uint16_t a_m = _uint16_sll(a_m_no_grs, h_grs_size);
546 0 : const uint16_t a_e_biased = _uint16_srl(a_e, h_e_pos);
547 0 : const uint16_t m_sa_unbias = _uint16_srl(a_e_unbias, h_e_pos);
548 0 : const uint16_t m_sa_default = _uint16_srl(diff_e, h_e_pos);
549 : const uint16_t m_sa_unbias_mask =
550 0 : _uint16_andc(is_a_e_nez_msb, is_b_e_nez_msb);
551 : const uint16_t m_sa =
552 0 : _uint16_sels(m_sa_unbias_mask, m_sa_unbias, m_sa_default);
553 0 : const uint16_t b_m_no_sticky = _uint16_sll(b_m_no_grs, h_grs_size);
554 0 : const uint16_t sh_m = _uint16_srl(b_m_no_sticky, m_sa);
555 0 : const uint16_t sticky_overflow = _uint16_sll(one, m_sa);
556 0 : const uint16_t sticky_mask = _uint16_dec(sticky_overflow);
557 0 : const uint16_t sticky_collect = _uint16_and(b_m_no_sticky, sticky_mask);
558 0 : const uint16_t is_sticky_set_msb = _uint16_neg(sticky_collect);
559 0 : const uint16_t sticky = _uint16_srl(is_sticky_set_msb, msb_to_lsb_sa);
560 0 : const uint16_t b_m = _uint16_or(sh_m, sticky);
561 0 : const uint16_t is_c_m_ab_pos_msb = _uint16_sub(b_m, a_m);
562 0 : const uint16_t c_inf = _uint16_or(a_s, h_e_mask);
563 0 : const uint16_t c_m_sum = _uint16_add(a_m, b_m);
564 0 : const uint16_t c_m_diff_ab = _uint16_sub(a_m, b_m);
565 0 : const uint16_t c_m_diff_ba = _uint16_sub(b_m, a_m);
566 : const uint16_t c_m_smag_diff =
567 0 : _uint16_sels(is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba);
568 0 : const uint16_t c_s_diff = _uint16_sels(is_c_m_ab_pos_msb, a_s, b_s);
569 0 : const uint16_t c_s = _uint16_sels(is_diff_sign_msb, c_s_diff, a_s);
570 0 : const uint16_t c_m_smag_diff_nlz = _uint16_cntlz(c_m_smag_diff);
571 0 : const uint16_t diff_norm_sa = _uint16_sub(c_m_smag_diff_nlz, one);
572 0 : const uint16_t is_diff_denorm_msb = _uint16_sub(a_e_biased, diff_norm_sa);
573 0 : const uint16_t is_diff_denorm = _uint16_ext(is_diff_denorm_msb);
574 0 : const uint16_t is_a_or_b_norm_msb = _uint16_neg(a_e_biased);
575 0 : const uint16_t diff_denorm_sa = _uint16_dec(a_e_biased);
576 0 : const uint16_t c_m_diff_denorm = _uint16_sll(c_m_smag_diff, diff_denorm_sa);
577 0 : const uint16_t c_m_diff_norm = _uint16_sll(c_m_smag_diff, diff_norm_sa);
578 0 : const uint16_t c_e_diff_norm = _uint16_sub(a_e_biased, diff_norm_sa);
579 : const uint16_t c_m_diff_ab_norm =
580 0 : _uint16_sels(is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm);
581 : const uint16_t c_e_diff_ab_norm =
582 0 : _uint16_andc(c_e_diff_norm, is_diff_denorm);
583 : const uint16_t c_m_diff =
584 0 : _uint16_sels(is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff);
585 : const uint16_t c_e_diff =
586 0 : _uint16_sels(is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased);
587 0 : const uint16_t is_diff_eqz_msb = _uint16_dec(c_m_diff);
588 : const uint16_t is_diff_exactly_zero_msb =
589 0 : _uint16_and(is_diff_sign_msb, is_diff_eqz_msb);
590 0 : const uint16_t is_diff_exactly_zero = _uint16_ext(is_diff_exactly_zero_msb);
591 : const uint16_t c_m_added =
592 0 : _uint16_sels(is_diff_sign_msb, c_m_diff, c_m_sum);
593 : const uint16_t c_e_added =
594 0 : _uint16_sels(is_diff_sign_msb, c_e_diff, a_e_biased);
595 0 : const uint16_t c_m_carry = _uint16_and(c_m_added, h_m_grs_carry);
596 0 : const uint16_t is_c_m_carry_msb = _uint16_neg(c_m_carry);
597 : const uint16_t c_e_hidden_offset =
598 0 : _uint16_andsrl(c_m_added, h_m_grs_carry, h_m_grs_carry_pos);
599 0 : const uint16_t c_m_sub_hidden = _uint16_srl(c_m_added, one);
600 : const uint16_t c_m_no_hidden =
601 0 : _uint16_sels(is_c_m_carry_msb, c_m_sub_hidden, c_m_added);
602 0 : const uint16_t c_e_no_hidden = _uint16_add(c_e_added, c_e_hidden_offset);
603 0 : const uint16_t c_m_no_hidden_msb = _uint16_and(c_m_no_hidden, h_m_msb_mask);
604 : const uint16_t undenorm_m_msb_odd =
605 0 : _uint16_srl(c_m_no_hidden_msb, h_m_msb_sa);
606 : const uint16_t undenorm_fix_e =
607 0 : _uint16_and(is_undenorm, undenorm_m_msb_odd);
608 0 : const uint16_t c_e_fixed = _uint16_add(c_e_no_hidden, undenorm_fix_e);
609 : const uint16_t c_m_round_amount =
610 0 : _uint16_and(c_m_no_hidden, h_grs_round_mask);
611 0 : const uint16_t c_m_rounded = _uint16_add(c_m_no_hidden, c_m_round_amount);
612 : const uint16_t c_m_round_overflow =
613 0 : _uint16_andsrl(c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos);
614 0 : const uint16_t c_e_rounded = _uint16_add(c_e_fixed, c_m_round_overflow);
615 0 : const uint16_t c_m_no_grs = _uint16_srlm(c_m_rounded, h_grs_size, h_m_mask);
616 0 : const uint16_t c_e = _uint16_sll(c_e_rounded, h_e_pos);
617 0 : const uint16_t c_em = _uint16_or(c_e, c_m_no_grs);
618 0 : const uint16_t c_normal = _uint16_or(c_s, c_em);
619 0 : const uint16_t c_inf_result = _uint16_sels(is_a_inf_msb, c_inf, c_normal);
620 : const uint16_t c_zero_result =
621 0 : _uint16_andc(c_inf_result, is_diff_exactly_zero);
622 : const uint16_t c_result =
623 0 : _uint16_sels(is_invalid_inf_op_msb, h_snan, c_zero_result);
624 :
625 0 : return (c_result);
626 : }
627 :
628 : // half_mul
629 : // --------
630 : //
631 : // May have 0 or 1 ulp difference from the following result:
632 : // (Round to nearest)
633 : // NOTE: Rounding mode differs between conversion and multiply
634 : //
635 : // union FLOAT_32
636 : // {
637 : // float f32;
638 : // uint32_t u32;
639 : // };
640 : //
641 : // union FLOAT_32 fx = { .u32 = half_to_float( x ) };
642 : // union FLOAT_32 fy = { .u32 = half_to_float( y ) };
643 : // union FLOAT_32 fz = { .f32 = fx.f32 * fy.f32 };
644 : // uint16_t z = float_to_half( fz );
645 : //
646 0 : uint16_t half_mul(uint16_t x, uint16_t y)
647 : {
648 0 : const uint32_t one = _uint32_li(0x00000001);
649 0 : const uint32_t h_s_mask = _uint32_li(0x00008000);
650 0 : const uint32_t h_e_mask = _uint32_li(0x00007c00);
651 0 : const uint32_t h_m_mask = _uint32_li(0x000003ff);
652 0 : const uint32_t h_m_hidden = _uint32_li(0x00000400);
653 0 : const uint32_t h_e_pos = _uint32_li(0x0000000a);
654 0 : const uint32_t h_e_bias = _uint32_li(0x0000000f);
655 0 : const uint32_t h_m_bit_count = _uint32_li(0x0000000a);
656 0 : const uint32_t h_m_bit_half_count = _uint32_li(0x00000005);
657 0 : const uint32_t h_nan_min = _uint32_li(0x00007c01);
658 0 : const uint32_t h_e_mask_minus_one = _uint32_li(0x00007bff);
659 0 : const uint32_t h_snan = _uint32_li(0x0000fe00);
660 0 : const uint32_t m_round_overflow_bit = _uint32_li(0x00000020);
661 0 : const uint32_t m_hidden_bit = _uint32_li(0x00100000);
662 0 : const uint32_t a_s = _uint32_and(x, h_s_mask);
663 0 : const uint32_t b_s = _uint32_and(y, h_s_mask);
664 0 : const uint32_t c_s = _uint32_xor(a_s, b_s);
665 0 : const uint32_t x_e = _uint32_and(x, h_e_mask);
666 0 : const uint32_t x_e_eqz_msb = _uint32_dec(x_e);
667 0 : const uint32_t a = _uint32_sels(x_e_eqz_msb, y, x);
668 0 : const uint32_t b = _uint32_sels(x_e_eqz_msb, x, y);
669 0 : const uint32_t a_e = _uint32_and(a, h_e_mask);
670 0 : const uint32_t b_e = _uint32_and(b, h_e_mask);
671 0 : const uint32_t a_m = _uint32_and(a, h_m_mask);
672 0 : const uint32_t b_m = _uint32_and(b, h_m_mask);
673 0 : const uint32_t a_e_amount = _uint32_srl(a_e, h_e_pos);
674 0 : const uint32_t b_e_amount = _uint32_srl(b_e, h_e_pos);
675 0 : const uint32_t a_m_with_hidden = _uint32_or(a_m, h_m_hidden);
676 0 : const uint32_t b_m_with_hidden = _uint32_or(b_m, h_m_hidden);
677 0 : const uint32_t c_m_normal = _uint32_mul(a_m_with_hidden, b_m_with_hidden);
678 0 : const uint32_t c_m_denorm_biased = _uint32_mul(a_m_with_hidden, b_m);
679 0 : const uint32_t c_e_denorm_unbias_e = _uint32_sub(h_e_bias, a_e_amount);
680 : const uint32_t c_m_denorm_round_amount =
681 0 : _uint32_and(c_m_denorm_biased, h_m_mask);
682 : const uint32_t c_m_denorm_rounded =
683 0 : _uint32_add(c_m_denorm_biased, c_m_denorm_round_amount);
684 : const uint32_t c_m_denorm_inplace =
685 0 : _uint32_srl(c_m_denorm_rounded, h_m_bit_count);
686 : const uint32_t c_m_denorm_unbiased =
687 0 : _uint32_srl(c_m_denorm_inplace, c_e_denorm_unbias_e);
688 0 : const uint32_t c_m_denorm = _uint32_and(c_m_denorm_unbiased, h_m_mask);
689 0 : const uint32_t c_e_amount_biased = _uint32_add(a_e_amount, b_e_amount);
690 : const uint32_t c_e_amount_unbiased =
691 0 : _uint32_sub(c_e_amount_biased, h_e_bias);
692 0 : const uint32_t is_c_e_unbiased_underflow = _uint32_ext(c_e_amount_unbiased);
693 0 : const uint32_t c_e_underflow_half_sa = _uint32_neg(c_e_amount_unbiased);
694 0 : const uint32_t c_e_underflow_sa = _uint32_sll(c_e_underflow_half_sa, one);
695 0 : const uint32_t c_m_underflow = _uint32_srl(c_m_normal, c_e_underflow_sa);
696 : const uint32_t c_e_underflow_added =
697 0 : _uint32_andc(c_e_amount_unbiased, is_c_e_unbiased_underflow);
698 : const uint32_t c_m_underflow_added =
699 0 : _uint32_selb(is_c_e_unbiased_underflow, c_m_underflow, c_m_normal);
700 : const uint32_t is_mul_overflow_test =
701 0 : _uint32_and(c_e_underflow_added, m_round_overflow_bit);
702 0 : const uint32_t is_mul_overflow_msb = _uint32_neg(is_mul_overflow_test);
703 0 : const uint32_t c_e_norm_radix_corrected = _uint32_inc(c_e_underflow_added);
704 : const uint32_t c_m_norm_radix_corrected =
705 0 : _uint32_srl(c_m_underflow_added, one);
706 : const uint32_t c_m_norm_hidden_bit =
707 0 : _uint32_and(c_m_norm_radix_corrected, m_hidden_bit);
708 0 : const uint32_t is_c_m_norm_no_hidden_msb = _uint32_dec(c_m_norm_hidden_bit);
709 : const uint32_t c_m_norm_lo =
710 0 : _uint32_srl(c_m_norm_radix_corrected, h_m_bit_half_count);
711 0 : const uint32_t c_m_norm_lo_nlz = _uint16_cntlz(c_m_norm_lo);
712 : const uint32_t is_c_m_hidden_nunderflow_msb =
713 0 : _uint32_sub(c_m_norm_lo_nlz, c_e_norm_radix_corrected);
714 : const uint32_t is_c_m_hidden_underflow_msb =
715 0 : _uint32_not(is_c_m_hidden_nunderflow_msb);
716 : const uint32_t is_c_m_hidden_underflow =
717 0 : _uint32_ext(is_c_m_hidden_underflow_msb);
718 : const uint32_t c_m_hidden_underflow_normalized_sa =
719 0 : _uint32_srl(c_m_norm_lo_nlz, one);
720 : const uint32_t c_m_hidden_underflow_normalized =
721 0 : _uint32_sll(c_m_norm_radix_corrected,
722 0 : c_m_hidden_underflow_normalized_sa);
723 : const uint32_t c_m_hidden_normalized =
724 0 : _uint32_sll(c_m_norm_radix_corrected, c_m_norm_lo_nlz);
725 : const uint32_t c_e_hidden_normalized =
726 0 : _uint32_sub(c_e_norm_radix_corrected, c_m_norm_lo_nlz);
727 : const uint32_t c_e_hidden =
728 0 : _uint32_andc(c_e_hidden_normalized, is_c_m_hidden_underflow);
729 : const uint32_t c_m_hidden =
730 : _uint32_sels(is_c_m_hidden_underflow_msb,
731 0 : c_m_hidden_underflow_normalized, c_m_hidden_normalized);
732 : const uint32_t c_m_normalized =
733 : _uint32_sels(is_c_m_norm_no_hidden_msb, c_m_hidden,
734 0 : c_m_norm_radix_corrected);
735 : const uint32_t c_e_normalized =
736 : _uint32_sels(is_c_m_norm_no_hidden_msb, c_e_hidden,
737 0 : c_e_norm_radix_corrected);
738 : const uint32_t c_m_norm_round_amount =
739 0 : _uint32_and(c_m_normalized, h_m_mask);
740 : const uint32_t c_m_norm_rounded =
741 0 : _uint32_add(c_m_normalized, c_m_norm_round_amount);
742 : const uint32_t is_round_overflow_test =
743 0 : _uint32_and(c_e_normalized, m_round_overflow_bit);
744 0 : const uint32_t is_round_overflow_msb = _uint32_neg(is_round_overflow_test);
745 : const uint32_t c_m_norm_inplace =
746 0 : _uint32_srl(c_m_norm_rounded, h_m_bit_count);
747 0 : const uint32_t c_m = _uint32_and(c_m_norm_inplace, h_m_mask);
748 0 : const uint32_t c_e_norm_inplace = _uint32_sll(c_e_normalized, h_e_pos);
749 0 : const uint32_t c_e = _uint32_and(c_e_norm_inplace, h_e_mask);
750 0 : const uint32_t c_em_nan = _uint32_or(h_e_mask, a_m);
751 0 : const uint32_t c_nan = _uint32_or(a_s, c_em_nan);
752 0 : const uint32_t c_denorm = _uint32_or(c_s, c_m_denorm);
753 0 : const uint32_t c_inf = _uint32_or(c_s, h_e_mask);
754 0 : const uint32_t c_em_norm = _uint32_or(c_e, c_m);
755 0 : const uint32_t is_a_e_flagged_msb = _uint32_sub(h_e_mask_minus_one, a_e);
756 0 : const uint32_t is_b_e_flagged_msb = _uint32_sub(h_e_mask_minus_one, b_e);
757 0 : const uint32_t is_a_e_eqz_msb = _uint32_dec(a_e);
758 0 : const uint32_t is_a_m_eqz_msb = _uint32_dec(a_m);
759 0 : const uint32_t is_b_e_eqz_msb = _uint32_dec(b_e);
760 0 : const uint32_t is_b_m_eqz_msb = _uint32_dec(b_m);
761 0 : const uint32_t is_b_eqz_msb = _uint32_and(is_b_e_eqz_msb, is_b_m_eqz_msb);
762 0 : const uint32_t is_a_eqz_msb = _uint32_and(is_a_e_eqz_msb, is_a_m_eqz_msb);
763 : const uint32_t is_c_nan_via_a_msb =
764 0 : _uint32_andc(is_a_e_flagged_msb, is_b_e_flagged_msb);
765 : const uint32_t is_c_nan_via_b_msb =
766 0 : _uint32_andc(is_b_e_flagged_msb, is_b_m_eqz_msb);
767 : const uint32_t is_c_nan_msb =
768 0 : _uint32_or(is_c_nan_via_a_msb, is_c_nan_via_b_msb);
769 : const uint32_t is_c_denorm_msb =
770 0 : _uint32_andc(is_b_e_eqz_msb, is_a_e_flagged_msb);
771 : const uint32_t is_a_inf_msb =
772 0 : _uint32_and(is_a_e_flagged_msb, is_a_m_eqz_msb);
773 0 : const uint32_t is_c_snan_msb = _uint32_and(is_a_inf_msb, is_b_eqz_msb);
774 : const uint32_t is_c_nan_min_via_a_msb =
775 0 : _uint32_and(is_a_e_flagged_msb, is_b_eqz_msb);
776 : const uint32_t is_c_nan_min_via_b_msb =
777 0 : _uint32_and(is_b_e_flagged_msb, is_a_eqz_msb);
778 : const uint32_t is_c_nan_min_msb =
779 0 : _uint32_or(is_c_nan_min_via_a_msb, is_c_nan_min_via_b_msb);
780 : const uint32_t is_c_inf_msb =
781 0 : _uint32_or(is_a_e_flagged_msb, is_b_e_flagged_msb);
782 : const uint32_t is_overflow_msb =
783 0 : _uint32_or(is_round_overflow_msb, is_mul_overflow_msb);
784 : const uint32_t c_em_overflow_result =
785 0 : _uint32_sels(is_overflow_msb, h_e_mask, c_em_norm);
786 0 : const uint32_t c_common_result = _uint32_or(c_s, c_em_overflow_result);
787 : const uint32_t c_zero_result =
788 0 : _uint32_sels(is_b_eqz_msb, c_s, c_common_result);
789 : const uint32_t c_nan_result =
790 0 : _uint32_sels(is_c_nan_msb, c_nan, c_zero_result);
791 : const uint32_t c_nan_min_result =
792 0 : _uint32_sels(is_c_nan_min_msb, h_nan_min, c_nan_result);
793 : const uint32_t c_inf_result =
794 0 : _uint32_sels(is_c_inf_msb, c_inf, c_nan_min_result);
795 : const uint32_t c_denorm_result =
796 0 : _uint32_sels(is_c_denorm_msb, c_denorm, c_inf_result);
797 : const uint32_t c_result =
798 0 : _uint32_sels(is_c_snan_msb, h_snan, c_denorm_result);
799 :
800 0 : return (uint16_t)(c_result);
801 : }
|