softfloat.c@ 13728

最後變更在這個檔案從13728是 1,由 vboxsync 提交於 55 年前
import
檔案大小: 188.5 KB

行
1
2	/*============================================================================
3
4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5	Package, Release 2b.
6
7	Written by John R. Hauser. This work was made possible in part by the
8	International Computer Science Institute, located at Suite 600, 1947 Center
9	Street, Berkeley, California 94704. Funding was partially provided by the
10	National Science Foundation under grant MIP-9311980. The original version
11	of this code was written as part of a project to build a fixed-point vector
12	processor in collaboration with the University of California at Berkeley,
13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15	arithmetic/SoftFloat.html'.
16
17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26	Derivative works are acceptable, even for commercial purposes, so long as
27	(1) the source code for the derivative work includes prominent notice that
28	the work is derivative, and (2) the source code includes prominent notice with
29	these four paragraphs for those parts of this code that are retained.
30
31	=============================================================================*/
32
33	#include "softfloat.h"
34
35	/*----------------------------------------------------------------------------
36	\| Primitive arithmetic functions, including multi-word arithmetic, and
37	\| division and square root approximations. (Can be specialized to target if
38	\| desired.)
39	----------------------------------------------------------------------------/
40	#include "softfloat-macros.h"
41
42	/*----------------------------------------------------------------------------
43	\| Functions and definitions to determine: (1) whether tininess for underflow
44	\| is detected before or after rounding by default, (2) what (if anything)
45	\| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46	\| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47	\| are propagated from function inputs to output. These details are target-
48	\| specific.
49	----------------------------------------------------------------------------/
50	#include "softfloat-specialize.h"
51
52	void set_float_rounding_mode(int val STATUS_PARAM)
53	{
54	STATUS(float_rounding_mode) = val;
55	}
56
57	void set_float_exception_flags(int val STATUS_PARAM)
58	{
59	STATUS(float_exception_flags) = val;
60	}
61
62	#ifdef FLOATX80
63	void set_floatx80_rounding_precision(int val STATUS_PARAM)
64	{
65	STATUS(floatx80_rounding_precision) = val;
66	}
67	#endif
68
69	/*----------------------------------------------------------------------------
70	\| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71	\| and 7, and returns the properly rounded 32-bit integer corresponding to the
72	\| input. If `zSign' is 1, the input is negated before being converted to an
73	\| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74	\| is simply rounded to an integer, with the inexact exception raised if the
75	\| input cannot be represented exactly as an integer. However, if the fixed-
76	\| point input is too large, the invalid exception is raised and the largest
77	\| positive or negative integer is returned.
78	----------------------------------------------------------------------------/
79
80	static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81	{
82	int8 roundingMode;
83	flag roundNearestEven;
84	int8 roundIncrement, roundBits;
85	int32 z;
86
87	roundingMode = STATUS(float_rounding_mode);
88	roundNearestEven = ( roundingMode == float_round_nearest_even );
89	roundIncrement = 0x40;
90	if ( ! roundNearestEven ) {
91	if ( roundingMode == float_round_to_zero ) {
92	roundIncrement = 0;
93	}
94	else {
95	roundIncrement = 0x7F;
96	if ( zSign ) {
97	if ( roundingMode == float_round_up ) roundIncrement = 0;
98	}
99	else {
100	if ( roundingMode == float_round_down ) roundIncrement = 0;
101	}
102	}
103	}
104	roundBits = absZ & 0x7F;
105	absZ = ( absZ + roundIncrement )>>7;
106	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107	z = absZ;
108	if ( zSign ) z = - z;
109	if ( ( absZ>>32 ) \|\| ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110	float_raise( float_flag_invalid STATUS_VAR);
111	return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112	}
113	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
114	return z;
115
116	}
117
118	/*----------------------------------------------------------------------------
119	\| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120	\| `absZ1', with binary point between bits 63 and 64 (between the input words),
121	\| and returns the properly rounded 64-bit integer corresponding to the input.
122	\| If `zSign' is 1, the input is negated before being converted to an integer.
123	\| Ordinarily, the fixed-point input is simply rounded to an integer, with
124	\| the inexact exception raised if the input cannot be represented exactly as
125	\| an integer. However, if the fixed-point input is too large, the invalid
126	\| exception is raised and the largest positive or negative integer is
127	\| returned.
128	----------------------------------------------------------------------------/
129
130	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131	{
132	int8 roundingMode;
133	flag roundNearestEven, increment;
134	int64 z;
135
136	roundingMode = STATUS(float_rounding_mode);
137	roundNearestEven = ( roundingMode == float_round_nearest_even );
138	increment = ( (sbits64) absZ1 < 0 );
139	if ( ! roundNearestEven ) {
140	if ( roundingMode == float_round_to_zero ) {
141	increment = 0;
142	}
143	else {
144	if ( zSign ) {
145	increment = ( roundingMode == float_round_down ) && absZ1;
146	}
147	else {
148	increment = ( roundingMode == float_round_up ) && absZ1;
149	}
150	}
151	}
152	if ( increment ) {
153	++absZ0;
154	if ( absZ0 == 0 ) goto overflow;
155	absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156	}
157	z = absZ0;
158	if ( zSign ) z = - z;
159	if ( z && ( ( z < 0 ) ^ zSign ) ) {
160	overflow:
161	float_raise( float_flag_invalid STATUS_VAR);
162	return
163	zSign ? (sbits64) LIT64( 0x8000000000000000 )
164	: LIT64( 0x7FFFFFFFFFFFFFFF );
165	}
166	if ( absZ1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
167	return z;
168
169	}
170
171	/*----------------------------------------------------------------------------
172	\| Returns the fraction bits of the single-precision floating-point value `a'.
173	----------------------------------------------------------------------------/
174
175	INLINE bits32 extractFloat32Frac( float32 a )
176	{
177
178	return a & 0x007FFFFF;
179
180	}
181
182	/*----------------------------------------------------------------------------
183	\| Returns the exponent bits of the single-precision floating-point value `a'.
184	----------------------------------------------------------------------------/
185
186	INLINE int16 extractFloat32Exp( float32 a )
187	{
188
189	return ( a>>23 ) & 0xFF;
190
191	}
192
193	/*----------------------------------------------------------------------------
194	\| Returns the sign bit of the single-precision floating-point value `a'.
195	----------------------------------------------------------------------------/
196
197	INLINE flag extractFloat32Sign( float32 a )
198	{
199
200	return a>>31;
201
202	}
203
204	/*----------------------------------------------------------------------------
205	\| Normalizes the subnormal single-precision floating-point value represented
206	\| by the denormalized significand `aSig'. The normalized exponent and
207	\| significand are stored at the locations pointed to by `zExpPtr' and
208	\| `zSigPtr', respectively.
209	----------------------------------------------------------------------------/
210
211	static void
212	normalizeFloat32Subnormal( bits32 aSig, int16 zExpPtr, bits32 zSigPtr )
213	{
214	int8 shiftCount;
215
216	shiftCount = countLeadingZeros32( aSig ) - 8;
217	*zSigPtr = aSig<<shiftCount;
218	*zExpPtr = 1 - shiftCount;
219
220	}
221
222	/*----------------------------------------------------------------------------
223	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224	\| single-precision floating-point value, returning the result. After being
225	\| shifted into the proper positions, the three fields are simply added
226	\| together to form the result. This means that any integer portion of `zSig'
227	\| will be added into the exponent. Since a properly normalized significand
228	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
229	\| than the desired result exponent whenever `zSig' is a complete, normalized
230	\| significand.
231	----------------------------------------------------------------------------/
232
233	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234	{
235
236	return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237
238	}
239
240	/*----------------------------------------------------------------------------
241	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242	\| and significand `zSig', and returns the proper single-precision floating-
243	\| point value corresponding to the abstract input. Ordinarily, the abstract
244	\| value is simply rounded and packed into the single-precision format, with
245	\| the inexact exception raised if the abstract input cannot be represented
246	\| exactly. However, if the abstract value is too large, the overflow and
247	\| inexact exceptions are raised and an infinity or maximal finite value is
248	\| returned. If the abstract value is too small, the input value is rounded to
249	\| a subnormal number, and the underflow and inexact exceptions are raised if
250	\| the abstract input cannot be represented exactly as a subnormal single-
251	\| precision floating-point number.
252	\| The input significand `zSig' has its binary point between bits 30
253	\| and 29, which is 7 bits to the left of the usual location. This shifted
254	\| significand must be normalized or smaller. If `zSig' is not normalized,
255	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
256	\| and it must not require rounding. In the usual case that `zSig' is
257	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
259	\| Binary Floating-Point Arithmetic.
260	----------------------------------------------------------------------------/
261
262	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263	{
264	int8 roundingMode;
265	flag roundNearestEven;
266	int8 roundIncrement, roundBits;
267	flag isTiny;
268
269	roundingMode = STATUS(float_rounding_mode);
270	roundNearestEven = ( roundingMode == float_round_nearest_even );
271	roundIncrement = 0x40;
272	if ( ! roundNearestEven ) {
273	if ( roundingMode == float_round_to_zero ) {
274	roundIncrement = 0;
275	}
276	else {
277	roundIncrement = 0x7F;
278	if ( zSign ) {
279	if ( roundingMode == float_round_up ) roundIncrement = 0;
280	}
281	else {
282	if ( roundingMode == float_round_down ) roundIncrement = 0;
283	}
284	}
285	}
286	roundBits = zSig & 0x7F;
287	if ( 0xFD <= (bits16) zExp ) {
288	if ( ( 0xFD < zExp )
289	\|\| ( ( zExp == 0xFD )
290	&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291	) {
292	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
293	return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294	}
295	if ( zExp < 0 ) {
296	isTiny =
297	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298	\|\| ( zExp < -1 )
299	\|\| ( zSig + roundIncrement < 0x80000000 );
300	shift32RightJamming( zSig, - zExp, &zSig );
301	zExp = 0;
302	roundBits = zSig & 0x7F;
303	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304	}
305	}
306	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
307	zSig = ( zSig + roundIncrement )>>7;
308	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309	if ( zSig == 0 ) zExp = 0;
310	return packFloat32( zSign, zExp, zSig );
311
312	}
313
314	/*----------------------------------------------------------------------------
315	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316	\| and significand `zSig', and returns the proper single-precision floating-
317	\| point value corresponding to the abstract input. This routine is just like
318	\| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319	\| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320	\| floating-point exponent.
321	----------------------------------------------------------------------------/
322
323	static float32
324	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325	{
326	int8 shiftCount;
327
328	shiftCount = countLeadingZeros32( zSig ) - 1;
329	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330
331	}
332
333	/*----------------------------------------------------------------------------
334	\| Returns the fraction bits of the double-precision floating-point value `a'.
335	----------------------------------------------------------------------------/
336
337	INLINE bits64 extractFloat64Frac( float64 a )
338	{
339
340	return a & LIT64( 0x000FFFFFFFFFFFFF );
341
342	}
343
344	/*----------------------------------------------------------------------------
345	\| Returns the exponent bits of the double-precision floating-point value `a'.
346	----------------------------------------------------------------------------/
347
348	INLINE int16 extractFloat64Exp( float64 a )
349	{
350
351	return ( a>>52 ) & 0x7FF;
352
353	}
354
355	/*----------------------------------------------------------------------------
356	\| Returns the sign bit of the double-precision floating-point value `a'.
357	----------------------------------------------------------------------------/
358
359	INLINE flag extractFloat64Sign( float64 a )
360	{
361
362	return a>>63;
363
364	}
365
366	/*----------------------------------------------------------------------------
367	\| Normalizes the subnormal double-precision floating-point value represented
368	\| by the denormalized significand `aSig'. The normalized exponent and
369	\| significand are stored at the locations pointed to by `zExpPtr' and
370	\| `zSigPtr', respectively.
371	----------------------------------------------------------------------------/
372
373	static void
374	normalizeFloat64Subnormal( bits64 aSig, int16 zExpPtr, bits64 zSigPtr )
375	{
376	int8 shiftCount;
377
378	shiftCount = countLeadingZeros64( aSig ) - 11;
379	*zSigPtr = aSig<<shiftCount;
380	*zExpPtr = 1 - shiftCount;
381
382	}
383
384	/*----------------------------------------------------------------------------
385	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386	\| double-precision floating-point value, returning the result. After being
387	\| shifted into the proper positions, the three fields are simply added
388	\| together to form the result. This means that any integer portion of `zSig'
389	\| will be added into the exponent. Since a properly normalized significand
390	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
391	\| than the desired result exponent whenever `zSig' is a complete, normalized
392	\| significand.
393	----------------------------------------------------------------------------/
394
395	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396	{
397
398	return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399
400	}
401
402	/*----------------------------------------------------------------------------
403	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404	\| and significand `zSig', and returns the proper double-precision floating-
405	\| point value corresponding to the abstract input. Ordinarily, the abstract
406	\| value is simply rounded and packed into the double-precision format, with
407	\| the inexact exception raised if the abstract input cannot be represented
408	\| exactly. However, if the abstract value is too large, the overflow and
409	\| inexact exceptions are raised and an infinity or maximal finite value is
410	\| returned. If the abstract value is too small, the input value is rounded
411	\| to a subnormal number, and the underflow and inexact exceptions are raised
412	\| if the abstract input cannot be represented exactly as a subnormal double-
413	\| precision floating-point number.
414	\| The input significand `zSig' has its binary point between bits 62
415	\| and 61, which is 10 bits to the left of the usual location. This shifted
416	\| significand must be normalized or smaller. If `zSig' is not normalized,
417	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
418	\| and it must not require rounding. In the usual case that `zSig' is
419	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
421	\| Binary Floating-Point Arithmetic.
422	----------------------------------------------------------------------------/
423
424	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425	{
426	int8 roundingMode;
427	flag roundNearestEven;
428	int16 roundIncrement, roundBits;
429	flag isTiny;
430
431	roundingMode = STATUS(float_rounding_mode);
432	roundNearestEven = ( roundingMode == float_round_nearest_even );
433	roundIncrement = 0x200;
434	if ( ! roundNearestEven ) {
435	if ( roundingMode == float_round_to_zero ) {
436	roundIncrement = 0;
437	}
438	else {
439	roundIncrement = 0x3FF;
440	if ( zSign ) {
441	if ( roundingMode == float_round_up ) roundIncrement = 0;
442	}
443	else {
444	if ( roundingMode == float_round_down ) roundIncrement = 0;
445	}
446	}
447	}
448	roundBits = zSig & 0x3FF;
449	if ( 0x7FD <= (bits16) zExp ) {
450	if ( ( 0x7FD < zExp )
451	\|\| ( ( zExp == 0x7FD )
452	&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453	) {
454	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
455	return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456	}
457	if ( zExp < 0 ) {
458	isTiny =
459	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460	\|\| ( zExp < -1 )
461	\|\| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462	shift64RightJamming( zSig, - zExp, &zSig );
463	zExp = 0;
464	roundBits = zSig & 0x3FF;
465	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466	}
467	}
468	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
469	zSig = ( zSig + roundIncrement )>>10;
470	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471	if ( zSig == 0 ) zExp = 0;
472	return packFloat64( zSign, zExp, zSig );
473
474	}
475
476	/*----------------------------------------------------------------------------
477	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478	\| and significand `zSig', and returns the proper double-precision floating-
479	\| point value corresponding to the abstract input. This routine is just like
480	\| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481	\| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482	\| floating-point exponent.
483	----------------------------------------------------------------------------/
484
485	static float64
486	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487	{
488	int8 shiftCount;
489
490	shiftCount = countLeadingZeros64( zSig ) - 1;
491	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492
493	}
494
495	#ifdef FLOATX80
496
497	/*----------------------------------------------------------------------------
498	\| Returns the fraction bits of the extended double-precision floating-point
499	\| value `a'.
500	----------------------------------------------------------------------------/
501
502	INLINE bits64 extractFloatx80Frac( floatx80 a )
503	{
504
505	return a.low;
506
507	}
508
509	/*----------------------------------------------------------------------------
510	\| Returns the exponent bits of the extended double-precision floating-point
511	\| value `a'.
512	----------------------------------------------------------------------------/
513
514	INLINE int32 extractFloatx80Exp( floatx80 a )
515	{
516
517	return a.high & 0x7FFF;
518
519	}
520
521	/*----------------------------------------------------------------------------
522	\| Returns the sign bit of the extended double-precision floating-point value
523	\| `a'.
524	----------------------------------------------------------------------------/
525
526	INLINE flag extractFloatx80Sign( floatx80 a )
527	{
528
529	return a.high>>15;
530
531	}
532
533	/*----------------------------------------------------------------------------
534	\| Normalizes the subnormal extended double-precision floating-point value
535	\| represented by the denormalized significand `aSig'. The normalized exponent
536	\| and significand are stored at the locations pointed to by `zExpPtr' and
537	\| `zSigPtr', respectively.
538	----------------------------------------------------------------------------/
539
540	static void
541	normalizeFloatx80Subnormal( bits64 aSig, int32 zExpPtr, bits64 zSigPtr )
542	{
543	int8 shiftCount;
544
545	shiftCount = countLeadingZeros64( aSig );
546	*zSigPtr = aSig<<shiftCount;
547	*zExpPtr = 1 - shiftCount;
548
549	}
550
551	/*----------------------------------------------------------------------------
552	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553	\| extended double-precision floating-point value, returning the result.
554	----------------------------------------------------------------------------/
555
556	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557	{
558	floatx80 z;
559
560	z.low = zSig;
561	z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562	return z;
563
564	}
565
566	/*----------------------------------------------------------------------------
567	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568	\| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569	\| and returns the proper extended double-precision floating-point value
570	\| corresponding to the abstract input. Ordinarily, the abstract value is
571	\| rounded and packed into the extended double-precision format, with the
572	\| inexact exception raised if the abstract input cannot be represented
573	\| exactly. However, if the abstract value is too large, the overflow and
574	\| inexact exceptions are raised and an infinity or maximal finite value is
575	\| returned. If the abstract value is too small, the input value is rounded to
576	\| a subnormal number, and the underflow and inexact exceptions are raised if
577	\| the abstract input cannot be represented exactly as a subnormal extended
578	\| double-precision floating-point number.
579	\| If `roundingPrecision' is 32 or 64, the result is rounded to the same
580	\| number of bits as single or double precision, respectively. Otherwise, the
581	\| result is rounded to the full precision of the extended double-precision
582	\| format.
583	\| The input significand must be normalized or smaller. If the input
584	\| significand is not normalized, `zExp' must be 0; in that case, the result
585	\| returned is a subnormal number, and it must not require rounding. The
586	\| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587	\| Floating-Point Arithmetic.
588	----------------------------------------------------------------------------/
589
590	static floatx80
591	roundAndPackFloatx80(
592	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593	STATUS_PARAM)
594	{
595	int8 roundingMode;
596	flag roundNearestEven, increment, isTiny;
597	int64 roundIncrement, roundMask, roundBits;
598
599	roundingMode = STATUS(float_rounding_mode);
600	roundNearestEven = ( roundingMode == float_round_nearest_even );
601	if ( roundingPrecision == 80 ) goto precision80;
602	if ( roundingPrecision == 64 ) {
603	roundIncrement = LIT64( 0x0000000000000400 );
604	roundMask = LIT64( 0x00000000000007FF );
605	}
606	else if ( roundingPrecision == 32 ) {
607	roundIncrement = LIT64( 0x0000008000000000 );
608	roundMask = LIT64( 0x000000FFFFFFFFFF );
609	}
610	else {
611	goto precision80;
612	}
613	zSig0 \|= ( zSig1 != 0 );
614	if ( ! roundNearestEven ) {
615	if ( roundingMode == float_round_to_zero ) {
616	roundIncrement = 0;
617	}
618	else {
619	roundIncrement = roundMask;
620	if ( zSign ) {
621	if ( roundingMode == float_round_up ) roundIncrement = 0;
622	}
623	else {
624	if ( roundingMode == float_round_down ) roundIncrement = 0;
625	}
626	}
627	}
628	roundBits = zSig0 & roundMask;
629	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630	if ( ( 0x7FFE < zExp )
631	\|\| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632	) {
633	goto overflow;
634	}
635	if ( zExp <= 0 ) {
636	isTiny =
637	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638	\|\| ( zExp < 0 )
639	\|\| ( zSig0 <= zSig0 + roundIncrement );
640	shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641	zExp = 0;
642	roundBits = zSig0 & roundMask;
643	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
645	zSig0 += roundIncrement;
646	if ( (sbits64) zSig0 < 0 ) zExp = 1;
647	roundIncrement = roundMask + 1;
648	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649	roundMask \|= roundIncrement;
650	}
651	zSig0 &= ~ roundMask;
652	return packFloatx80( zSign, zExp, zSig0 );
653	}
654	}
655	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
656	zSig0 += roundIncrement;
657	if ( zSig0 < roundIncrement ) {
658	++zExp;
659	zSig0 = LIT64( 0x8000000000000000 );
660	}
661	roundIncrement = roundMask + 1;
662	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663	roundMask \|= roundIncrement;
664	}
665	zSig0 &= ~ roundMask;
666	if ( zSig0 == 0 ) zExp = 0;
667	return packFloatx80( zSign, zExp, zSig0 );
668	precision80:
669	increment = ( (sbits64) zSig1 < 0 );
670	if ( ! roundNearestEven ) {
671	if ( roundingMode == float_round_to_zero ) {
672	increment = 0;
673	}
674	else {
675	if ( zSign ) {
676	increment = ( roundingMode == float_round_down ) && zSig1;
677	}
678	else {
679	increment = ( roundingMode == float_round_up ) && zSig1;
680	}
681	}
682	}
683	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684	if ( ( 0x7FFE < zExp )
685	\|\| ( ( zExp == 0x7FFE )
686	&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687	&& increment
688	)
689	) {
690	roundMask = 0;
691	overflow:
692	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
693	if ( ( roundingMode == float_round_to_zero )
694	\|\| ( zSign && ( roundingMode == float_round_up ) )
695	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
696	) {
697	return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698	}
699	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700	}
701	if ( zExp <= 0 ) {
702	isTiny =
703	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704	\|\| ( zExp < 0 )
705	\|\| ! increment
706	\|\| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707	shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708	zExp = 0;
709	if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
711	if ( roundNearestEven ) {
712	increment = ( (sbits64) zSig1 < 0 );
713	}
714	else {
715	if ( zSign ) {
716	increment = ( roundingMode == float_round_down ) && zSig1;
717	}
718	else {
719	increment = ( roundingMode == float_round_up ) && zSig1;
720	}
721	}
722	if ( increment ) {
723	++zSig0;
724	zSig0 &=
725	~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726	if ( (sbits64) zSig0 < 0 ) zExp = 1;
727	}
728	return packFloatx80( zSign, zExp, zSig0 );
729	}
730	}
731	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
732	if ( increment ) {
733	++zSig0;
734	if ( zSig0 == 0 ) {
735	++zExp;
736	zSig0 = LIT64( 0x8000000000000000 );
737	}
738	else {
739	zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740	}
741	}
742	else {
743	if ( zSig0 == 0 ) zExp = 0;
744	}
745	return packFloatx80( zSign, zExp, zSig0 );
746
747	}
748
749	/*----------------------------------------------------------------------------
750	\| Takes an abstract floating-point value having sign `zSign', exponent
751	\| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752	\| and returns the proper extended double-precision floating-point value
753	\| corresponding to the abstract input. This routine is just like
754	\| `roundAndPackFloatx80' except that the input significand does not have to be
755	\| normalized.
756	----------------------------------------------------------------------------/
757
758	static floatx80
759	normalizeRoundAndPackFloatx80(
760	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761	STATUS_PARAM)
762	{
763	int8 shiftCount;
764
765	if ( zSig0 == 0 ) {
766	zSig0 = zSig1;
767	zSig1 = 0;
768	zExp -= 64;
769	}
770	shiftCount = countLeadingZeros64( zSig0 );
771	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772	zExp -= shiftCount;
773	return
774	roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775
776	}
777
778	#endif
779
780	#ifdef FLOAT128
781
782	/*----------------------------------------------------------------------------
783	\| Returns the least-significant 64 fraction bits of the quadruple-precision
784	\| floating-point value `a'.
785	----------------------------------------------------------------------------/
786
787	INLINE bits64 extractFloat128Frac1( float128 a )
788	{
789
790	return a.low;
791
792	}
793
794	/*----------------------------------------------------------------------------
795	\| Returns the most-significant 48 fraction bits of the quadruple-precision
796	\| floating-point value `a'.
797	----------------------------------------------------------------------------/
798
799	INLINE bits64 extractFloat128Frac0( float128 a )
800	{
801
802	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803
804	}
805
806	/*----------------------------------------------------------------------------
807	\| Returns the exponent bits of the quadruple-precision floating-point value
808	\| `a'.
809	----------------------------------------------------------------------------/
810
811	INLINE int32 extractFloat128Exp( float128 a )
812	{
813
814	return ( a.high>>48 ) & 0x7FFF;
815
816	}
817
818	/*----------------------------------------------------------------------------
819	\| Returns the sign bit of the quadruple-precision floating-point value `a'.
820	----------------------------------------------------------------------------/
821
822	INLINE flag extractFloat128Sign( float128 a )
823	{
824
825	return a.high>>63;
826
827	}
828
829	/*----------------------------------------------------------------------------
830	\| Normalizes the subnormal quadruple-precision floating-point value
831	\| represented by the denormalized significand formed by the concatenation of
832	\| `aSig0' and `aSig1'. The normalized exponent is stored at the location
833	\| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834	\| significand are stored at the location pointed to by `zSig0Ptr', and the
835	\| least significant 64 bits of the normalized significand are stored at the
836	\| location pointed to by `zSig1Ptr'.
837	----------------------------------------------------------------------------/
838
839	static void
840	normalizeFloat128Subnormal(
841	bits64 aSig0,
842	bits64 aSig1,
843	int32 *zExpPtr,
844	bits64 *zSig0Ptr,
845	bits64 *zSig1Ptr
846	)
847	{
848	int8 shiftCount;
849
850	if ( aSig0 == 0 ) {
851	shiftCount = countLeadingZeros64( aSig1 ) - 15;
852	if ( shiftCount < 0 ) {
853	*zSig0Ptr = aSig1>>( - shiftCount );
854	*zSig1Ptr = aSig1<<( shiftCount & 63 );
855	}
856	else {
857	*zSig0Ptr = aSig1<<shiftCount;
858	*zSig1Ptr = 0;
859	}
860	*zExpPtr = - shiftCount - 63;
861	}
862	else {
863	shiftCount = countLeadingZeros64( aSig0 ) - 15;
864	shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865	*zExpPtr = 1 - shiftCount;
866	}
867
868	}
869
870	/*----------------------------------------------------------------------------
871	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
872	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873	\| floating-point value, returning the result. After being shifted into the
874	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875	\| added together to form the most significant 32 bits of the result. This
876	\| means that any integer portion of `zSig0' will be added into the exponent.
877	\| Since a properly normalized significand will have an integer portion equal
878	\| to 1, the `zExp' input should be 1 less than the desired result exponent
879	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880	\| significand.
881	----------------------------------------------------------------------------/
882
883	INLINE float128
884	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885	{
886	float128 z;
887
888	z.low = zSig1;
889	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890	return z;
891
892	}
893
894	/*----------------------------------------------------------------------------
895	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
897	\| and `zSig2', and returns the proper quadruple-precision floating-point value
898	\| corresponding to the abstract input. Ordinarily, the abstract value is
899	\| simply rounded and packed into the quadruple-precision format, with the
900	\| inexact exception raised if the abstract input cannot be represented
901	\| exactly. However, if the abstract value is too large, the overflow and
902	\| inexact exceptions are raised and an infinity or maximal finite value is
903	\| returned. If the abstract value is too small, the input value is rounded to
904	\| a subnormal number, and the underflow and inexact exceptions are raised if
905	\| the abstract input cannot be represented exactly as a subnormal quadruple-
906	\| precision floating-point number.
907	\| The input significand must be normalized or smaller. If the input
908	\| significand is not normalized, `zExp' must be 0; in that case, the result
909	\| returned is a subnormal number, and it must not require rounding. In the
910	\| usual case that the input significand is normalized, `zExp' must be 1 less
911	\| than the ``true'' floating-point exponent. The handling of underflow and
912	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913	----------------------------------------------------------------------------/
914
915	static float128
916	roundAndPackFloat128(
917	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918	{
919	int8 roundingMode;
920	flag roundNearestEven, increment, isTiny;
921
922	roundingMode = STATUS(float_rounding_mode);
923	roundNearestEven = ( roundingMode == float_round_nearest_even );
924	increment = ( (sbits64) zSig2 < 0 );
925	if ( ! roundNearestEven ) {
926	if ( roundingMode == float_round_to_zero ) {
927	increment = 0;
928	}
929	else {
930	if ( zSign ) {
931	increment = ( roundingMode == float_round_down ) && zSig2;
932	}
933	else {
934	increment = ( roundingMode == float_round_up ) && zSig2;
935	}
936	}
937	}
938	if ( 0x7FFD <= (bits32) zExp ) {
939	if ( ( 0x7FFD < zExp )
940	\|\| ( ( zExp == 0x7FFD )
941	&& eq128(
942	LIT64( 0x0001FFFFFFFFFFFF ),
943	LIT64( 0xFFFFFFFFFFFFFFFF ),
944	zSig0,
945	zSig1
946	)
947	&& increment
948	)
949	) {
950	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
951	if ( ( roundingMode == float_round_to_zero )
952	\|\| ( zSign && ( roundingMode == float_round_up ) )
953	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
954	) {
955	return
956	packFloat128(
957	zSign,
958	0x7FFE,
959	LIT64( 0x0000FFFFFFFFFFFF ),
960	LIT64( 0xFFFFFFFFFFFFFFFF )
961	);
962	}
963	return packFloat128( zSign, 0x7FFF, 0, 0 );
964	}
965	if ( zExp < 0 ) {
966	isTiny =
967	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968	\|\| ( zExp < -1 )
969	\|\| ! increment
970	\|\| lt128(
971	zSig0,
972	zSig1,
973	LIT64( 0x0001FFFFFFFFFFFF ),
974	LIT64( 0xFFFFFFFFFFFFFFFF )
975	);
976	shift128ExtraRightJamming(
977	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978	zExp = 0;
979	if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980	if ( roundNearestEven ) {
981	increment = ( (sbits64) zSig2 < 0 );
982	}
983	else {
984	if ( zSign ) {
985	increment = ( roundingMode == float_round_down ) && zSig2;
986	}
987	else {
988	increment = ( roundingMode == float_round_up ) && zSig2;
989	}
990	}
991	}
992	}
993	if ( zSig2 ) STATUS(float_exception_flags) \|= float_flag_inexact;
994	if ( increment ) {
995	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997	}
998	else {
999	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
1000	}
1001	return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003	}
1004
1005	/*----------------------------------------------------------------------------
1006	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008	\| returns the proper quadruple-precision floating-point value corresponding
1009	\| to the abstract input. This routine is just like `roundAndPackFloat128'
1010	\| except that the input significand has fewer bits and does not have to be
1011	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1012	\| point exponent.
1013	----------------------------------------------------------------------------/
1014
1015	static float128
1016	normalizeRoundAndPackFloat128(
1017	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018	{
1019	int8 shiftCount;
1020	bits64 zSig2;
1021
1022	if ( zSig0 == 0 ) {
1023	zSig0 = zSig1;
1024	zSig1 = 0;
1025	zExp -= 64;
1026	}
1027	shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028	if ( 0 <= shiftCount ) {
1029	zSig2 = 0;
1030	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031	}
1032	else {
1033	shift128ExtraRightJamming(
1034	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035	}
1036	zExp -= shiftCount;
1037	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039	}
1040
1041	#endif
1042
1043	/*----------------------------------------------------------------------------
1044	\| Returns the result of converting the 32-bit two's complement integer `a'
1045	\| to the single-precision floating-point format. The conversion is performed
1046	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047	----------------------------------------------------------------------------/
1048
1049	float32 int32_to_float32( int32 a STATUS_PARAM )
1050	{
1051	flag zSign;
1052
1053	if ( a == 0 ) return 0;
1054	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055	zSign = ( a < 0 );
1056	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058	}
1059
1060	/*----------------------------------------------------------------------------
1061	\| Returns the result of converting the 32-bit two's complement integer `a'
1062	\| to the double-precision floating-point format. The conversion is performed
1063	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064	----------------------------------------------------------------------------/
1065
1066	float64 int32_to_float64( int32 a STATUS_PARAM )
1067	{
1068	flag zSign;
1069	uint32 absA;
1070	int8 shiftCount;
1071	bits64 zSig;
1072
1073	if ( a == 0 ) return 0;
1074	zSign = ( a < 0 );
1075	absA = zSign ? - a : a;
1076	shiftCount = countLeadingZeros32( absA ) + 21;
1077	zSig = absA;
1078	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080	}
1081
1082	#ifdef FLOATX80
1083
1084	/*----------------------------------------------------------------------------
1085	\| Returns the result of converting the 32-bit two's complement integer `a'
1086	\| to the extended double-precision floating-point format. The conversion
1087	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088	\| Arithmetic.
1089	----------------------------------------------------------------------------/
1090
1091	floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092	{
1093	flag zSign;
1094	uint32 absA;
1095	int8 shiftCount;
1096	bits64 zSig;
1097
1098	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099	zSign = ( a < 0 );
1100	absA = zSign ? - a : a;
1101	shiftCount = countLeadingZeros32( absA ) + 32;
1102	zSig = absA;
1103	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105	}
1106
1107	#endif
1108
1109	#ifdef FLOAT128
1110
1111	/*----------------------------------------------------------------------------
1112	\| Returns the result of converting the 32-bit two's complement integer `a' to
1113	\| the quadruple-precision floating-point format. The conversion is performed
1114	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115	----------------------------------------------------------------------------/
1116
1117	float128 int32_to_float128( int32 a STATUS_PARAM )
1118	{
1119	flag zSign;
1120	uint32 absA;
1121	int8 shiftCount;
1122	bits64 zSig0;
1123
1124	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125	zSign = ( a < 0 );
1126	absA = zSign ? - a : a;
1127	shiftCount = countLeadingZeros32( absA ) + 17;
1128	zSig0 = absA;
1129	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131	}
1132
1133	#endif
1134
1135	/*----------------------------------------------------------------------------
1136	\| Returns the result of converting the 64-bit two's complement integer `a'
1137	\| to the single-precision floating-point format. The conversion is performed
1138	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139	----------------------------------------------------------------------------/
1140
1141	float32 int64_to_float32( int64 a STATUS_PARAM )
1142	{
1143	flag zSign;
1144	uint64 absA;
1145	int8 shiftCount;
1146
1147	if ( a == 0 ) return 0;
1148	zSign = ( a < 0 );
1149	absA = zSign ? - a : a;
1150	shiftCount = countLeadingZeros64( absA ) - 40;
1151	if ( 0 <= shiftCount ) {
1152	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153	}
1154	else {
1155	shiftCount += 7;
1156	if ( shiftCount < 0 ) {
1157	shift64RightJamming( absA, - shiftCount, &absA );
1158	}
1159	else {
1160	absA <<= shiftCount;
1161	}
1162	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163	}
1164
1165	}
1166
1167	/*----------------------------------------------------------------------------
1168	\| Returns the result of converting the 64-bit two's complement integer `a'
1169	\| to the double-precision floating-point format. The conversion is performed
1170	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171	----------------------------------------------------------------------------/
1172
1173	float64 int64_to_float64( int64 a STATUS_PARAM )
1174	{
1175	flag zSign;
1176
1177	if ( a == 0 ) return 0;
1178	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179	return packFloat64( 1, 0x43E, 0 );
1180	}
1181	zSign = ( a < 0 );
1182	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184	}
1185
1186	#ifdef FLOATX80
1187
1188	/*----------------------------------------------------------------------------
1189	\| Returns the result of converting the 64-bit two's complement integer `a'
1190	\| to the extended double-precision floating-point format. The conversion
1191	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192	\| Arithmetic.
1193	----------------------------------------------------------------------------/
1194
1195	floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196	{
1197	flag zSign;
1198	uint64 absA;
1199	int8 shiftCount;
1200
1201	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202	zSign = ( a < 0 );
1203	absA = zSign ? - a : a;
1204	shiftCount = countLeadingZeros64( absA );
1205	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207	}
1208
1209	#endif
1210
1211	#ifdef FLOAT128
1212
1213	/*----------------------------------------------------------------------------
1214	\| Returns the result of converting the 64-bit two's complement integer `a' to
1215	\| the quadruple-precision floating-point format. The conversion is performed
1216	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217	----------------------------------------------------------------------------/
1218
1219	float128 int64_to_float128( int64 a STATUS_PARAM )
1220	{
1221	flag zSign;
1222	uint64 absA;
1223	int8 shiftCount;
1224	int32 zExp;
1225	bits64 zSig0, zSig1;
1226
1227	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228	zSign = ( a < 0 );
1229	absA = zSign ? - a : a;
1230	shiftCount = countLeadingZeros64( absA ) + 49;
1231	zExp = 0x406E - shiftCount;
1232	if ( 64 <= shiftCount ) {
1233	zSig1 = 0;
1234	zSig0 = absA;
1235	shiftCount -= 64;
1236	}
1237	else {
1238	zSig1 = absA;
1239	zSig0 = 0;
1240	}
1241	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242	return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244	}
1245
1246	#endif
1247
1248	/*----------------------------------------------------------------------------
1249	\| Returns the result of converting the single-precision floating-point value
1250	\| `a' to the 32-bit two's complement integer format. The conversion is
1251	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252	\| Arithmetic---which means in particular that the conversion is rounded
1253	\| according to the current rounding mode. If `a' is a NaN, the largest
1254	\| positive integer is returned. Otherwise, if the conversion overflows, the
1255	\| largest integer with the same sign as `a' is returned.
1256	----------------------------------------------------------------------------/
1257
1258	int32 float32_to_int32( float32 a STATUS_PARAM )
1259	{
1260	flag aSign;
1261	int16 aExp, shiftCount;
1262	bits32 aSig;
1263	bits64 aSig64;
1264
1265	aSig = extractFloat32Frac( a );
1266	aExp = extractFloat32Exp( a );
1267	aSign = extractFloat32Sign( a );
1268	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269	if ( aExp ) aSig \|= 0x00800000;
1270	shiftCount = 0xAF - aExp;
1271	aSig64 = aSig;
1272	aSig64 <<= 32;
1273	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274	return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276	}
1277
1278	/*----------------------------------------------------------------------------
1279	\| Returns the result of converting the single-precision floating-point value
1280	\| `a' to the 32-bit two's complement integer format. The conversion is
1281	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282	\| Arithmetic, except that the conversion is always rounded toward zero.
1283	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1284	\| the conversion overflows, the largest integer with the same sign as `a' is
1285	\| returned.
1286	----------------------------------------------------------------------------/
1287
1288	int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289	{
1290	flag aSign;
1291	int16 aExp, shiftCount;
1292	bits32 aSig;
1293	int32 z;
1294
1295	aSig = extractFloat32Frac( a );
1296	aExp = extractFloat32Exp( a );
1297	aSign = extractFloat32Sign( a );
1298	shiftCount = aExp - 0x9E;
1299	if ( 0 <= shiftCount ) {
1300	if ( a != 0xCF000000 ) {
1301	float_raise( float_flag_invalid STATUS_VAR);
1302	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303	}
1304	return (sbits32) 0x80000000;
1305	}
1306	else if ( aExp <= 0x7E ) {
1307	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1308	return 0;
1309	}
1310	aSig = ( aSig \| 0x00800000 )<<8;
1311	z = aSig>>( - shiftCount );
1312	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313	STATUS(float_exception_flags) \|= float_flag_inexact;
1314	}
1315	if ( aSign ) z = - z;
1316	return z;
1317
1318	}
1319
1320	/*----------------------------------------------------------------------------
1321	\| Returns the result of converting the single-precision floating-point value
1322	\| `a' to the 64-bit two's complement integer format. The conversion is
1323	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324	\| Arithmetic---which means in particular that the conversion is rounded
1325	\| according to the current rounding mode. If `a' is a NaN, the largest
1326	\| positive integer is returned. Otherwise, if the conversion overflows, the
1327	\| largest integer with the same sign as `a' is returned.
1328	----------------------------------------------------------------------------/
1329
1330	int64 float32_to_int64( float32 a STATUS_PARAM )
1331	{
1332	flag aSign;
1333	int16 aExp, shiftCount;
1334	bits32 aSig;
1335	bits64 aSig64, aSigExtra;
1336
1337	aSig = extractFloat32Frac( a );
1338	aExp = extractFloat32Exp( a );
1339	aSign = extractFloat32Sign( a );
1340	shiftCount = 0xBE - aExp;
1341	if ( shiftCount < 0 ) {
1342	float_raise( float_flag_invalid STATUS_VAR);
1343	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1344	return LIT64( 0x7FFFFFFFFFFFFFFF );
1345	}
1346	return (sbits64) LIT64( 0x8000000000000000 );
1347	}
1348	if ( aExp ) aSig \|= 0x00800000;
1349	aSig64 = aSig;
1350	aSig64 <<= 40;
1351	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352	return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354	}
1355
1356	/*----------------------------------------------------------------------------
1357	\| Returns the result of converting the single-precision floating-point value
1358	\| `a' to the 64-bit two's complement integer format. The conversion is
1359	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360	\| Arithmetic, except that the conversion is always rounded toward zero. If
1361	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1362	\| conversion overflows, the largest integer with the same sign as `a' is
1363	\| returned.
1364	----------------------------------------------------------------------------/
1365
1366	int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367	{
1368	flag aSign;
1369	int16 aExp, shiftCount;
1370	bits32 aSig;
1371	bits64 aSig64;
1372	int64 z;
1373
1374	aSig = extractFloat32Frac( a );
1375	aExp = extractFloat32Exp( a );
1376	aSign = extractFloat32Sign( a );
1377	shiftCount = aExp - 0xBE;
1378	if ( 0 <= shiftCount ) {
1379	if ( a != 0xDF000000 ) {
1380	float_raise( float_flag_invalid STATUS_VAR);
1381	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1382	return LIT64( 0x7FFFFFFFFFFFFFFF );
1383	}
1384	}
1385	return (sbits64) LIT64( 0x8000000000000000 );
1386	}
1387	else if ( aExp <= 0x7E ) {
1388	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1389	return 0;
1390	}
1391	aSig64 = aSig \| 0x00800000;
1392	aSig64 <<= 40;
1393	z = aSig64>>( - shiftCount );
1394	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395	STATUS(float_exception_flags) \|= float_flag_inexact;
1396	}
1397	if ( aSign ) z = - z;
1398	return z;
1399
1400	}
1401
1402	/*----------------------------------------------------------------------------
1403	\| Returns the result of converting the single-precision floating-point value
1404	\| `a' to the double-precision floating-point format. The conversion is
1405	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406	\| Arithmetic.
1407	----------------------------------------------------------------------------/
1408
1409	float64 float32_to_float64( float32 a STATUS_PARAM )
1410	{
1411	flag aSign;
1412	int16 aExp;
1413	bits32 aSig;
1414
1415	aSig = extractFloat32Frac( a );
1416	aExp = extractFloat32Exp( a );
1417	aSign = extractFloat32Sign( a );
1418	if ( aExp == 0xFF ) {
1419	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420	return packFloat64( aSign, 0x7FF, 0 );
1421	}
1422	if ( aExp == 0 ) {
1423	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425	--aExp;
1426	}
1427	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429	}
1430
1431	#ifdef FLOATX80
1432
1433	/*----------------------------------------------------------------------------
1434	\| Returns the result of converting the single-precision floating-point value
1435	\| `a' to the extended double-precision floating-point format. The conversion
1436	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437	\| Arithmetic.
1438	----------------------------------------------------------------------------/
1439
1440	floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441	{
1442	flag aSign;
1443	int16 aExp;
1444	bits32 aSig;
1445
1446	aSig = extractFloat32Frac( a );
1447	aExp = extractFloat32Exp( a );
1448	aSign = extractFloat32Sign( a );
1449	if ( aExp == 0xFF ) {
1450	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452	}
1453	if ( aExp == 0 ) {
1454	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456	}
1457	aSig \|= 0x00800000;
1458	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460	}
1461
1462	#endif
1463
1464	#ifdef FLOAT128
1465
1466	/*----------------------------------------------------------------------------
1467	\| Returns the result of converting the single-precision floating-point value
1468	\| `a' to the double-precision floating-point format. The conversion is
1469	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470	\| Arithmetic.
1471	----------------------------------------------------------------------------/
1472
1473	float128 float32_to_float128( float32 a STATUS_PARAM )
1474	{
1475	flag aSign;
1476	int16 aExp;
1477	bits32 aSig;
1478
1479	aSig = extractFloat32Frac( a );
1480	aExp = extractFloat32Exp( a );
1481	aSign = extractFloat32Sign( a );
1482	if ( aExp == 0xFF ) {
1483	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484	return packFloat128( aSign, 0x7FFF, 0, 0 );
1485	}
1486	if ( aExp == 0 ) {
1487	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489	--aExp;
1490	}
1491	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493	}
1494
1495	#endif
1496
1497	/*----------------------------------------------------------------------------
1498	\| Rounds the single-precision floating-point value `a' to an integer, and
1499	\| returns the result as a single-precision floating-point value. The
1500	\| operation is performed according to the IEC/IEEE Standard for Binary
1501	\| Floating-Point Arithmetic.
1502	----------------------------------------------------------------------------/
1503
1504	float32 float32_round_to_int( float32 a STATUS_PARAM)
1505	{
1506	flag aSign;
1507	int16 aExp;
1508	bits32 lastBitMask, roundBitsMask;
1509	int8 roundingMode;
1510	float32 z;
1511
1512	aExp = extractFloat32Exp( a );
1513	if ( 0x96 <= aExp ) {
1514	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515	return propagateFloat32NaN( a, a STATUS_VAR );
1516	}
1517	return a;
1518	}
1519	if ( aExp <= 0x7E ) {
1520	if ( (bits32) ( a<<1 ) == 0 ) return a;
1521	STATUS(float_exception_flags) \|= float_flag_inexact;
1522	aSign = extractFloat32Sign( a );
1523	switch ( STATUS(float_rounding_mode) ) {
1524	case float_round_nearest_even:
1525	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526	return packFloat32( aSign, 0x7F, 0 );
1527	}
1528	break;
1529	case float_round_down:
1530	return aSign ? 0xBF800000 : 0;
1531	case float_round_up:
1532	return aSign ? 0x80000000 : 0x3F800000;
1533	}
1534	return packFloat32( aSign, 0, 0 );
1535	}
1536	lastBitMask = 1;
1537	lastBitMask <<= 0x96 - aExp;
1538	roundBitsMask = lastBitMask - 1;
1539	z = a;
1540	roundingMode = STATUS(float_rounding_mode);
1541	if ( roundingMode == float_round_nearest_even ) {
1542	z += lastBitMask>>1;
1543	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544	}
1545	else if ( roundingMode != float_round_to_zero ) {
1546	if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547	z += roundBitsMask;
1548	}
1549	}
1550	z &= ~ roundBitsMask;
1551	if ( z != a ) STATUS(float_exception_flags) \|= float_flag_inexact;
1552	return z;
1553
1554	}
1555
1556	/*----------------------------------------------------------------------------
1557	\| Returns the result of adding the absolute values of the single-precision
1558	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1559	\| before being returned. `zSign' is ignored if the result is a NaN.
1560	\| The addition is performed according to the IEC/IEEE Standard for Binary
1561	\| Floating-Point Arithmetic.
1562	----------------------------------------------------------------------------/
1563
1564	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565	{
1566	int16 aExp, bExp, zExp;
1567	bits32 aSig, bSig, zSig;
1568	int16 expDiff;
1569
1570	aSig = extractFloat32Frac( a );
1571	aExp = extractFloat32Exp( a );
1572	bSig = extractFloat32Frac( b );
1573	bExp = extractFloat32Exp( b );
1574	expDiff = aExp - bExp;
1575	aSig <<= 6;
1576	bSig <<= 6;
1577	if ( 0 < expDiff ) {
1578	if ( aExp == 0xFF ) {
1579	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580	return a;
1581	}
1582	if ( bExp == 0 ) {
1583	--expDiff;
1584	}
1585	else {
1586	bSig \|= 0x20000000;
1587	}
1588	shift32RightJamming( bSig, expDiff, &bSig );
1589	zExp = aExp;
1590	}
1591	else if ( expDiff < 0 ) {
1592	if ( bExp == 0xFF ) {
1593	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594	return packFloat32( zSign, 0xFF, 0 );
1595	}
1596	if ( aExp == 0 ) {
1597	++expDiff;
1598	}
1599	else {
1600	aSig \|= 0x20000000;
1601	}
1602	shift32RightJamming( aSig, - expDiff, &aSig );
1603	zExp = bExp;
1604	}
1605	else {
1606	if ( aExp == 0xFF ) {
1607	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608	return a;
1609	}
1610	if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611	zSig = 0x40000000 + aSig + bSig;
1612	zExp = aExp;
1613	goto roundAndPack;
1614	}
1615	aSig \|= 0x20000000;
1616	zSig = ( aSig + bSig )<<1;
1617	--zExp;
1618	if ( (sbits32) zSig < 0 ) {
1619	zSig = aSig + bSig;
1620	++zExp;
1621	}
1622	roundAndPack:
1623	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625	}
1626
1627	/*----------------------------------------------------------------------------
1628	\| Returns the result of subtracting the absolute values of the single-
1629	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
1630	\| difference is negated before being returned. `zSign' is ignored if the
1631	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
1632	\| Standard for Binary Floating-Point Arithmetic.
1633	----------------------------------------------------------------------------/
1634
1635	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636	{
1637	int16 aExp, bExp, zExp;
1638	bits32 aSig, bSig, zSig;
1639	int16 expDiff;
1640
1641	aSig = extractFloat32Frac( a );
1642	aExp = extractFloat32Exp( a );
1643	bSig = extractFloat32Frac( b );
1644	bExp = extractFloat32Exp( b );
1645	expDiff = aExp - bExp;
1646	aSig <<= 7;
1647	bSig <<= 7;
1648	if ( 0 < expDiff ) goto aExpBigger;
1649	if ( expDiff < 0 ) goto bExpBigger;
1650	if ( aExp == 0xFF ) {
1651	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652	float_raise( float_flag_invalid STATUS_VAR);
1653	return float32_default_nan;
1654	}
1655	if ( aExp == 0 ) {
1656	aExp = 1;
1657	bExp = 1;
1658	}
1659	if ( bSig < aSig ) goto aBigger;
1660	if ( aSig < bSig ) goto bBigger;
1661	return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662	bExpBigger:
1663	if ( bExp == 0xFF ) {
1664	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665	return packFloat32( zSign ^ 1, 0xFF, 0 );
1666	}
1667	if ( aExp == 0 ) {
1668	++expDiff;
1669	}
1670	else {
1671	aSig \|= 0x40000000;
1672	}
1673	shift32RightJamming( aSig, - expDiff, &aSig );
1674	bSig \|= 0x40000000;
1675	bBigger:
1676	zSig = bSig - aSig;
1677	zExp = bExp;
1678	zSign ^= 1;
1679	goto normalizeRoundAndPack;
1680	aExpBigger:
1681	if ( aExp == 0xFF ) {
1682	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683	return a;
1684	}
1685	if ( bExp == 0 ) {
1686	--expDiff;
1687	}
1688	else {
1689	bSig \|= 0x40000000;
1690	}
1691	shift32RightJamming( bSig, expDiff, &bSig );
1692	aSig \|= 0x40000000;
1693	aBigger:
1694	zSig = aSig - bSig;
1695	zExp = aExp;
1696	normalizeRoundAndPack:
1697	--zExp;
1698	return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700	}
1701
1702	/*----------------------------------------------------------------------------
1703	\| Returns the result of adding the single-precision floating-point values `a'
1704	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
1705	\| Binary Floating-Point Arithmetic.
1706	----------------------------------------------------------------------------/
1707
1708	float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709	{
1710	flag aSign, bSign;
1711
1712	aSign = extractFloat32Sign( a );
1713	bSign = extractFloat32Sign( b );
1714	if ( aSign == bSign ) {
1715	return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716	}
1717	else {
1718	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719	}
1720
1721	}
1722
1723	/*----------------------------------------------------------------------------
1724	\| Returns the result of subtracting the single-precision floating-point values
1725	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1726	\| for Binary Floating-Point Arithmetic.
1727	----------------------------------------------------------------------------/
1728
1729	float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730	{
1731	flag aSign, bSign;
1732
1733	aSign = extractFloat32Sign( a );
1734	bSign = extractFloat32Sign( b );
1735	if ( aSign == bSign ) {
1736	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737	}
1738	else {
1739	return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740	}
1741
1742	}
1743
1744	/*----------------------------------------------------------------------------
1745	\| Returns the result of multiplying the single-precision floating-point values
1746	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1747	\| for Binary Floating-Point Arithmetic.
1748	----------------------------------------------------------------------------/
1749
1750	float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751	{
1752	flag aSign, bSign, zSign;
1753	int16 aExp, bExp, zExp;
1754	bits32 aSig, bSig;
1755	bits64 zSig64;
1756	bits32 zSig;
1757
1758	aSig = extractFloat32Frac( a );
1759	aExp = extractFloat32Exp( a );
1760	aSign = extractFloat32Sign( a );
1761	bSig = extractFloat32Frac( b );
1762	bExp = extractFloat32Exp( b );
1763	bSign = extractFloat32Sign( b );
1764	zSign = aSign ^ bSign;
1765	if ( aExp == 0xFF ) {
1766	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1767	return propagateFloat32NaN( a, b STATUS_VAR );
1768	}
1769	if ( ( bExp \| bSig ) == 0 ) {
1770	float_raise( float_flag_invalid STATUS_VAR);
1771	return float32_default_nan;
1772	}
1773	return packFloat32( zSign, 0xFF, 0 );
1774	}
1775	if ( bExp == 0xFF ) {
1776	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777	if ( ( aExp \| aSig ) == 0 ) {
1778	float_raise( float_flag_invalid STATUS_VAR);
1779	return float32_default_nan;
1780	}
1781	return packFloat32( zSign, 0xFF, 0 );
1782	}
1783	if ( aExp == 0 ) {
1784	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786	}
1787	if ( bExp == 0 ) {
1788	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790	}
1791	zExp = aExp + bExp - 0x7F;
1792	aSig = ( aSig \| 0x00800000 )<<7;
1793	bSig = ( bSig \| 0x00800000 )<<8;
1794	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795	zSig = zSig64;
1796	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797	zSig <<= 1;
1798	--zExp;
1799	}
1800	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802	}
1803
1804	/*----------------------------------------------------------------------------
1805	\| Returns the result of dividing the single-precision floating-point value `a'
1806	\| by the corresponding value `b'. The operation is performed according to the
1807	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808	----------------------------------------------------------------------------/
1809
1810	float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811	{
1812	flag aSign, bSign, zSign;
1813	int16 aExp, bExp, zExp;
1814	bits32 aSig, bSig, zSig;
1815
1816	aSig = extractFloat32Frac( a );
1817	aExp = extractFloat32Exp( a );
1818	aSign = extractFloat32Sign( a );
1819	bSig = extractFloat32Frac( b );
1820	bExp = extractFloat32Exp( b );
1821	bSign = extractFloat32Sign( b );
1822	zSign = aSign ^ bSign;
1823	if ( aExp == 0xFF ) {
1824	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825	if ( bExp == 0xFF ) {
1826	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827	float_raise( float_flag_invalid STATUS_VAR);
1828	return float32_default_nan;
1829	}
1830	return packFloat32( zSign, 0xFF, 0 );
1831	}
1832	if ( bExp == 0xFF ) {
1833	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834	return packFloat32( zSign, 0, 0 );
1835	}
1836	if ( bExp == 0 ) {
1837	if ( bSig == 0 ) {
1838	if ( ( aExp \| aSig ) == 0 ) {
1839	float_raise( float_flag_invalid STATUS_VAR);
1840	return float32_default_nan;
1841	}
1842	float_raise( float_flag_divbyzero STATUS_VAR);
1843	return packFloat32( zSign, 0xFF, 0 );
1844	}
1845	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846	}
1847	if ( aExp == 0 ) {
1848	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850	}
1851	zExp = aExp - bExp + 0x7D;
1852	aSig = ( aSig \| 0x00800000 )<<7;
1853	bSig = ( bSig \| 0x00800000 )<<8;
1854	if ( bSig <= ( aSig + aSig ) ) {
1855	aSig >>= 1;
1856	++zExp;
1857	}
1858	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859	if ( ( zSig & 0x3F ) == 0 ) {
1860	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861	}
1862	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864	}
1865
1866	/*----------------------------------------------------------------------------
1867	\| Returns the remainder of the single-precision floating-point value `a'
1868	\| with respect to the corresponding value `b'. The operation is performed
1869	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870	----------------------------------------------------------------------------/
1871
1872	float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873	{
1874	flag aSign, bSign, zSign;
1875	int16 aExp, bExp, expDiff;
1876	bits32 aSig, bSig;
1877	bits32 q;
1878	bits64 aSig64, bSig64, q64;
1879	bits32 alternateASig;
1880	sbits32 sigMean;
1881
1882	aSig = extractFloat32Frac( a );
1883	aExp = extractFloat32Exp( a );
1884	aSign = extractFloat32Sign( a );
1885	bSig = extractFloat32Frac( b );
1886	bExp = extractFloat32Exp( b );
1887	bSign = extractFloat32Sign( b );
1888	if ( aExp == 0xFF ) {
1889	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1890	return propagateFloat32NaN( a, b STATUS_VAR );
1891	}
1892	float_raise( float_flag_invalid STATUS_VAR);
1893	return float32_default_nan;
1894	}
1895	if ( bExp == 0xFF ) {
1896	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897	return a;
1898	}
1899	if ( bExp == 0 ) {
1900	if ( bSig == 0 ) {
1901	float_raise( float_flag_invalid STATUS_VAR);
1902	return float32_default_nan;
1903	}
1904	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905	}
1906	if ( aExp == 0 ) {
1907	if ( aSig == 0 ) return a;
1908	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909	}
1910	expDiff = aExp - bExp;
1911	aSig \|= 0x00800000;
1912	bSig \|= 0x00800000;
1913	if ( expDiff < 32 ) {
1914	aSig <<= 8;
1915	bSig <<= 8;
1916	if ( expDiff < 0 ) {
1917	if ( expDiff < -1 ) return a;
1918	aSig >>= 1;
1919	}
1920	q = ( bSig <= aSig );
1921	if ( q ) aSig -= bSig;
1922	if ( 0 < expDiff ) {
1923	q = ( ( (bits64) aSig )<<32 ) / bSig;
1924	q >>= 32 - expDiff;
1925	bSig >>= 2;
1926	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927	}
1928	else {
1929	aSig >>= 2;
1930	bSig >>= 2;
1931	}
1932	}
1933	else {
1934	if ( bSig <= aSig ) aSig -= bSig;
1935	aSig64 = ( (bits64) aSig )<<40;
1936	bSig64 = ( (bits64) bSig )<<40;
1937	expDiff -= 64;
1938	while ( 0 < expDiff ) {
1939	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941	aSig64 = - ( ( bSig * q64 )<<38 );
1942	expDiff -= 62;
1943	}
1944	expDiff += 64;
1945	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947	q = q64>>( 64 - expDiff );
1948	bSig <<= 6;
1949	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950	}
1951	do {
1952	alternateASig = aSig;
1953	++q;
1954	aSig -= bSig;
1955	} while ( 0 <= (sbits32) aSig );
1956	sigMean = aSig + alternateASig;
1957	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958	aSig = alternateASig;
1959	}
1960	zSign = ( (sbits32) aSig < 0 );
1961	if ( zSign ) aSig = - aSig;
1962	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964	}
1965
1966	/*----------------------------------------------------------------------------
1967	\| Returns the square root of the single-precision floating-point value `a'.
1968	\| The operation is performed according to the IEC/IEEE Standard for Binary
1969	\| Floating-Point Arithmetic.
1970	----------------------------------------------------------------------------/
1971
1972	float32 float32_sqrt( float32 a STATUS_PARAM )
1973	{
1974	flag aSign;
1975	int16 aExp, zExp;
1976	bits32 aSig, zSig;
1977	bits64 rem, term;
1978
1979	aSig = extractFloat32Frac( a );
1980	aExp = extractFloat32Exp( a );
1981	aSign = extractFloat32Sign( a );
1982	if ( aExp == 0xFF ) {
1983	if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984	if ( ! aSign ) return a;
1985	float_raise( float_flag_invalid STATUS_VAR);
1986	return float32_default_nan;
1987	}
1988	if ( aSign ) {
1989	if ( ( aExp \| aSig ) == 0 ) return a;
1990	float_raise( float_flag_invalid STATUS_VAR);
1991	return float32_default_nan;
1992	}
1993	if ( aExp == 0 ) {
1994	if ( aSig == 0 ) return 0;
1995	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996	}
1997	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998	aSig = ( aSig \| 0x00800000 )<<8;
1999	zSig = estimateSqrt32( aExp, aSig ) + 2;
2000	if ( ( zSig & 0x7F ) <= 5 ) {
2001	if ( zSig < 2 ) {
2002	zSig = 0x7FFFFFFF;
2003	goto roundAndPack;
2004	}
2005	aSig >>= aExp & 1;
2006	term = ( (bits64) zSig ) * zSig;
2007	rem = ( ( (bits64) aSig )<<32 ) - term;
2008	while ( (sbits64) rem < 0 ) {
2009	--zSig;
2010	rem += ( ( (bits64) zSig )<<1 ) \| 1;
2011	}
2012	zSig \|= ( rem != 0 );
2013	}
2014	shift32RightJamming( zSig, 1, &zSig );
2015	roundAndPack:
2016	return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018	}
2019
2020	/*----------------------------------------------------------------------------
2021	\| Returns 1 if the single-precision floating-point value `a' is equal to
2022	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2023	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024	----------------------------------------------------------------------------/
2025
2026	int float32_eq( float32 a, float32 b STATUS_PARAM )
2027	{
2028
2029	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031	) {
2032	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2033	float_raise( float_flag_invalid STATUS_VAR);
2034	}
2035	return 0;
2036	}
2037	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2038
2039	}
2040
2041	/*----------------------------------------------------------------------------
2042	\| Returns 1 if the single-precision floating-point value `a' is less than
2043	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
2044	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045	\| Arithmetic.
2046	----------------------------------------------------------------------------/
2047
2048	int float32_le( float32 a, float32 b STATUS_PARAM )
2049	{
2050	flag aSign, bSign;
2051
2052	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054	) {
2055	float_raise( float_flag_invalid STATUS_VAR);
2056	return 0;
2057	}
2058	aSign = extractFloat32Sign( a );
2059	bSign = extractFloat32Sign( b );
2060	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2061	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2062
2063	}
2064
2065	/*----------------------------------------------------------------------------
2066	\| Returns 1 if the single-precision floating-point value `a' is less than
2067	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2068	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069	----------------------------------------------------------------------------/
2070
2071	int float32_lt( float32 a, float32 b STATUS_PARAM )
2072	{
2073	flag aSign, bSign;
2074
2075	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077	) {
2078	float_raise( float_flag_invalid STATUS_VAR);
2079	return 0;
2080	}
2081	aSign = extractFloat32Sign( a );
2082	bSign = extractFloat32Sign( b );
2083	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
2084	return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086	}
2087
2088	/*----------------------------------------------------------------------------
2089	\| Returns 1 if the single-precision floating-point value `a' is equal to
2090	\| the corresponding value `b', and 0 otherwise. The invalid exception is
2091	\| raised if either operand is a NaN. Otherwise, the comparison is performed
2092	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093	----------------------------------------------------------------------------/
2094
2095	int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096	{
2097
2098	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100	) {
2101	float_raise( float_flag_invalid STATUS_VAR);
2102	return 0;
2103	}
2104	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2105
2106	}
2107
2108	/*----------------------------------------------------------------------------
2109	\| Returns 1 if the single-precision floating-point value `a' is less than or
2110	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2111	\| cause an exception. Otherwise, the comparison is performed according to the
2112	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113	----------------------------------------------------------------------------/
2114
2115	int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116	{
2117	flag aSign, bSign;
2118
2119	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121	) {
2122	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2123	float_raise( float_flag_invalid STATUS_VAR);
2124	}
2125	return 0;
2126	}
2127	aSign = extractFloat32Sign( a );
2128	bSign = extractFloat32Sign( b );
2129	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
2130	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2131
2132	}
2133
2134	/*----------------------------------------------------------------------------
2135	\| Returns 1 if the single-precision floating-point value `a' is less than
2136	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2137	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2138	\| Standard for Binary Floating-Point Arithmetic.
2139	----------------------------------------------------------------------------/
2140
2141	int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142	{
2143	flag aSign, bSign;
2144
2145	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147	) {
2148	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2149	float_raise( float_flag_invalid STATUS_VAR);
2150	}
2151	return 0;
2152	}
2153	aSign = extractFloat32Sign( a );
2154	bSign = extractFloat32Sign( b );
2155	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
2156	return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158	}
2159
2160	/*----------------------------------------------------------------------------
2161	\| Returns the result of converting the double-precision floating-point value
2162	\| `a' to the 32-bit two's complement integer format. The conversion is
2163	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164	\| Arithmetic---which means in particular that the conversion is rounded
2165	\| according to the current rounding mode. If `a' is a NaN, the largest
2166	\| positive integer is returned. Otherwise, if the conversion overflows, the
2167	\| largest integer with the same sign as `a' is returned.
2168	----------------------------------------------------------------------------/
2169
2170	int32 float64_to_int32( float64 a STATUS_PARAM )
2171	{
2172	flag aSign;
2173	int16 aExp, shiftCount;
2174	bits64 aSig;
2175
2176	aSig = extractFloat64Frac( a );
2177	aExp = extractFloat64Exp( a );
2178	aSign = extractFloat64Sign( a );
2179	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2181	shiftCount = 0x42C - aExp;
2182	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183	return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185	}
2186
2187	/*----------------------------------------------------------------------------
2188	\| Returns the result of converting the double-precision floating-point value
2189	\| `a' to the 32-bit two's complement integer format. The conversion is
2190	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191	\| Arithmetic, except that the conversion is always rounded toward zero.
2192	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2193	\| the conversion overflows, the largest integer with the same sign as `a' is
2194	\| returned.
2195	----------------------------------------------------------------------------/
2196
2197	int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198	{
2199	flag aSign;
2200	int16 aExp, shiftCount;
2201	bits64 aSig, savedASig;
2202	int32 z;
2203
2204	aSig = extractFloat64Frac( a );
2205	aExp = extractFloat64Exp( a );
2206	aSign = extractFloat64Sign( a );
2207	if ( 0x41E < aExp ) {
2208	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209	goto invalid;
2210	}
2211	else if ( aExp < 0x3FF ) {
2212	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2213	return 0;
2214	}
2215	aSig \|= LIT64( 0x0010000000000000 );
2216	shiftCount = 0x433 - aExp;
2217	savedASig = aSig;
2218	aSig >>= shiftCount;
2219	z = aSig;
2220	if ( aSign ) z = - z;
2221	if ( ( z < 0 ) ^ aSign ) {
2222	invalid:
2223	float_raise( float_flag_invalid STATUS_VAR);
2224	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225	}
2226	if ( ( aSig<<shiftCount ) != savedASig ) {
2227	STATUS(float_exception_flags) \|= float_flag_inexact;
2228	}
2229	return z;
2230
2231	}
2232
2233	/*----------------------------------------------------------------------------
2234	\| Returns the result of converting the double-precision floating-point value
2235	\| `a' to the 64-bit two's complement integer format. The conversion is
2236	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237	\| Arithmetic---which means in particular that the conversion is rounded
2238	\| according to the current rounding mode. If `a' is a NaN, the largest
2239	\| positive integer is returned. Otherwise, if the conversion overflows, the
2240	\| largest integer with the same sign as `a' is returned.
2241	----------------------------------------------------------------------------/
2242
2243	int64 float64_to_int64( float64 a STATUS_PARAM )
2244	{
2245	flag aSign;
2246	int16 aExp, shiftCount;
2247	bits64 aSig, aSigExtra;
2248
2249	aSig = extractFloat64Frac( a );
2250	aExp = extractFloat64Exp( a );
2251	aSign = extractFloat64Sign( a );
2252	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2253	shiftCount = 0x433 - aExp;
2254	if ( shiftCount <= 0 ) {
2255	if ( 0x43E < aExp ) {
2256	float_raise( float_flag_invalid STATUS_VAR);
2257	if ( ! aSign
2258	\|\| ( ( aExp == 0x7FF )
2259	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2260	) {
2261	return LIT64( 0x7FFFFFFFFFFFFFFF );
2262	}
2263	return (sbits64) LIT64( 0x8000000000000000 );
2264	}
2265	aSigExtra = 0;
2266	aSig <<= - shiftCount;
2267	}
2268	else {
2269	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270	}
2271	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273	}
2274
2275	/*----------------------------------------------------------------------------
2276	\| Returns the result of converting the double-precision floating-point value
2277	\| `a' to the 64-bit two's complement integer format. The conversion is
2278	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279	\| Arithmetic, except that the conversion is always rounded toward zero.
2280	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2281	\| the conversion overflows, the largest integer with the same sign as `a' is
2282	\| returned.
2283	----------------------------------------------------------------------------/
2284
2285	int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286	{
2287	flag aSign;
2288	int16 aExp, shiftCount;
2289	bits64 aSig;
2290	int64 z;
2291
2292	aSig = extractFloat64Frac( a );
2293	aExp = extractFloat64Exp( a );
2294	aSign = extractFloat64Sign( a );
2295	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2296	shiftCount = aExp - 0x433;
2297	if ( 0 <= shiftCount ) {
2298	if ( 0x43E <= aExp ) {
2299	if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300	float_raise( float_flag_invalid STATUS_VAR);
2301	if ( ! aSign
2302	\|\| ( ( aExp == 0x7FF )
2303	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2304	) {
2305	return LIT64( 0x7FFFFFFFFFFFFFFF );
2306	}
2307	}
2308	return (sbits64) LIT64( 0x8000000000000000 );
2309	}
2310	z = aSig<<shiftCount;
2311	}
2312	else {
2313	if ( aExp < 0x3FE ) {
2314	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2315	return 0;
2316	}
2317	z = aSig>>( - shiftCount );
2318	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319	STATUS(float_exception_flags) \|= float_flag_inexact;
2320	}
2321	}
2322	if ( aSign ) z = - z;
2323	return z;
2324
2325	}
2326
2327	/*----------------------------------------------------------------------------
2328	\| Returns the result of converting the double-precision floating-point value
2329	\| `a' to the single-precision floating-point format. The conversion is
2330	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331	\| Arithmetic.
2332	----------------------------------------------------------------------------/
2333
2334	float32 float64_to_float32( float64 a STATUS_PARAM )
2335	{
2336	flag aSign;
2337	int16 aExp;
2338	bits64 aSig;
2339	bits32 zSig;
2340
2341	aSig = extractFloat64Frac( a );
2342	aExp = extractFloat64Exp( a );
2343	aSign = extractFloat64Sign( a );
2344	if ( aExp == 0x7FF ) {
2345	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346	return packFloat32( aSign, 0xFF, 0 );
2347	}
2348	shift64RightJamming( aSig, 22, &aSig );
2349	zSig = aSig;
2350	if ( aExp \|\| zSig ) {
2351	zSig \|= 0x40000000;
2352	aExp -= 0x381;
2353	}
2354	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356	}
2357
2358	#ifdef FLOATX80
2359
2360	/*----------------------------------------------------------------------------
2361	\| Returns the result of converting the double-precision floating-point value
2362	\| `a' to the extended double-precision floating-point format. The conversion
2363	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364	\| Arithmetic.
2365	----------------------------------------------------------------------------/
2366
2367	floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368	{
2369	flag aSign;
2370	int16 aExp;
2371	bits64 aSig;
2372
2373	aSig = extractFloat64Frac( a );
2374	aExp = extractFloat64Exp( a );
2375	aSign = extractFloat64Sign( a );
2376	if ( aExp == 0x7FF ) {
2377	if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379	}
2380	if ( aExp == 0 ) {
2381	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383	}
2384	return
2385	packFloatx80(
2386	aSign, aExp + 0x3C00, ( aSig \| LIT64( 0x0010000000000000 ) )<<11 );
2387
2388	}
2389
2390	#endif
2391
2392	#ifdef FLOAT128
2393
2394	/*----------------------------------------------------------------------------
2395	\| Returns the result of converting the double-precision floating-point value
2396	\| `a' to the quadruple-precision floating-point format. The conversion is
2397	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2398	\| Arithmetic.
2399	----------------------------------------------------------------------------/
2400
2401	float128 float64_to_float128( float64 a STATUS_PARAM )
2402	{
2403	flag aSign;
2404	int16 aExp;
2405	bits64 aSig, zSig0, zSig1;
2406
2407	aSig = extractFloat64Frac( a );
2408	aExp = extractFloat64Exp( a );
2409	aSign = extractFloat64Sign( a );
2410	if ( aExp == 0x7FF ) {
2411	if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412	return packFloat128( aSign, 0x7FFF, 0, 0 );
2413	}
2414	if ( aExp == 0 ) {
2415	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417	--aExp;
2418	}
2419	shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420	return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421
2422	}
2423
2424	#endif
2425
2426	/*----------------------------------------------------------------------------
2427	\| Rounds the double-precision floating-point value `a' to an integer, and
2428	\| returns the result as a double-precision floating-point value. The
2429	\| operation is performed according to the IEC/IEEE Standard for Binary
2430	\| Floating-Point Arithmetic.
2431	----------------------------------------------------------------------------/
2432
2433	float64 float64_round_to_int( float64 a STATUS_PARAM )
2434	{
2435	flag aSign;
2436	int16 aExp;
2437	bits64 lastBitMask, roundBitsMask;
2438	int8 roundingMode;
2439	float64 z;
2440
2441	aExp = extractFloat64Exp( a );
2442	if ( 0x433 <= aExp ) {
2443	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444	return propagateFloat64NaN( a, a STATUS_VAR );
2445	}
2446	return a;
2447	}
2448	if ( aExp < 0x3FF ) {
2449	if ( (bits64) ( a<<1 ) == 0 ) return a;
2450	STATUS(float_exception_flags) \|= float_flag_inexact;
2451	aSign = extractFloat64Sign( a );
2452	switch ( STATUS(float_rounding_mode) ) {
2453	case float_round_nearest_even:
2454	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455	return packFloat64( aSign, 0x3FF, 0 );
2456	}
2457	break;
2458	case float_round_down:
2459	return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460	case float_round_up:
2461	return
2462	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463	}
2464	return packFloat64( aSign, 0, 0 );
2465	}
2466	lastBitMask = 1;
2467	lastBitMask <<= 0x433 - aExp;
2468	roundBitsMask = lastBitMask - 1;
2469	z = a;
2470	roundingMode = STATUS(float_rounding_mode);
2471	if ( roundingMode == float_round_nearest_even ) {
2472	z += lastBitMask>>1;
2473	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474	}
2475	else if ( roundingMode != float_round_to_zero ) {
2476	if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477	z += roundBitsMask;
2478	}
2479	}
2480	z &= ~ roundBitsMask;
2481	if ( z != a ) STATUS(float_exception_flags) \|= float_flag_inexact;
2482	return z;
2483
2484	}
2485
2486	float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2487	{
2488	int oldmode;
2489	float64 res;
2490	oldmode = STATUS(float_rounding_mode);
2491	STATUS(float_rounding_mode) = float_round_to_zero;
2492	res = float64_round_to_int(a STATUS_VAR);
2493	STATUS(float_rounding_mode) = oldmode;
2494	return res;
2495	}
2496
2497	/*----------------------------------------------------------------------------
2498	\| Returns the result of adding the absolute values of the double-precision
2499	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2500	\| before being returned. `zSign' is ignored if the result is a NaN.
2501	\| The addition is performed according to the IEC/IEEE Standard for Binary
2502	\| Floating-Point Arithmetic.
2503	----------------------------------------------------------------------------/
2504
2505	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2506	{
2507	int16 aExp, bExp, zExp;
2508	bits64 aSig, bSig, zSig;
2509	int16 expDiff;
2510
2511	aSig = extractFloat64Frac( a );
2512	aExp = extractFloat64Exp( a );
2513	bSig = extractFloat64Frac( b );
2514	bExp = extractFloat64Exp( b );
2515	expDiff = aExp - bExp;
2516	aSig <<= 9;
2517	bSig <<= 9;
2518	if ( 0 < expDiff ) {
2519	if ( aExp == 0x7FF ) {
2520	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2521	return a;
2522	}
2523	if ( bExp == 0 ) {
2524	--expDiff;
2525	}
2526	else {
2527	bSig \|= LIT64( 0x2000000000000000 );
2528	}
2529	shift64RightJamming( bSig, expDiff, &bSig );
2530	zExp = aExp;
2531	}
2532	else if ( expDiff < 0 ) {
2533	if ( bExp == 0x7FF ) {
2534	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2535	return packFloat64( zSign, 0x7FF, 0 );
2536	}
2537	if ( aExp == 0 ) {
2538	++expDiff;
2539	}
2540	else {
2541	aSig \|= LIT64( 0x2000000000000000 );
2542	}
2543	shift64RightJamming( aSig, - expDiff, &aSig );
2544	zExp = bExp;
2545	}
2546	else {
2547	if ( aExp == 0x7FF ) {
2548	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549	return a;
2550	}
2551	if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2552	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2553	zExp = aExp;
2554	goto roundAndPack;
2555	}
2556	aSig \|= LIT64( 0x2000000000000000 );
2557	zSig = ( aSig + bSig )<<1;
2558	--zExp;
2559	if ( (sbits64) zSig < 0 ) {
2560	zSig = aSig + bSig;
2561	++zExp;
2562	}
2563	roundAndPack:
2564	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2565
2566	}
2567
2568	/*----------------------------------------------------------------------------
2569	\| Returns the result of subtracting the absolute values of the double-
2570	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
2571	\| difference is negated before being returned. `zSign' is ignored if the
2572	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
2573	\| Standard for Binary Floating-Point Arithmetic.
2574	----------------------------------------------------------------------------/
2575
2576	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2577	{
2578	int16 aExp, bExp, zExp;
2579	bits64 aSig, bSig, zSig;
2580	int16 expDiff;
2581
2582	aSig = extractFloat64Frac( a );
2583	aExp = extractFloat64Exp( a );
2584	bSig = extractFloat64Frac( b );
2585	bExp = extractFloat64Exp( b );
2586	expDiff = aExp - bExp;
2587	aSig <<= 10;
2588	bSig <<= 10;
2589	if ( 0 < expDiff ) goto aExpBigger;
2590	if ( expDiff < 0 ) goto bExpBigger;
2591	if ( aExp == 0x7FF ) {
2592	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2593	float_raise( float_flag_invalid STATUS_VAR);
2594	return float64_default_nan;
2595	}
2596	if ( aExp == 0 ) {
2597	aExp = 1;
2598	bExp = 1;
2599	}
2600	if ( bSig < aSig ) goto aBigger;
2601	if ( aSig < bSig ) goto bBigger;
2602	return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2603	bExpBigger:
2604	if ( bExp == 0x7FF ) {
2605	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2606	return packFloat64( zSign ^ 1, 0x7FF, 0 );
2607	}
2608	if ( aExp == 0 ) {
2609	++expDiff;
2610	}
2611	else {
2612	aSig \|= LIT64( 0x4000000000000000 );
2613	}
2614	shift64RightJamming( aSig, - expDiff, &aSig );
2615	bSig \|= LIT64( 0x4000000000000000 );
2616	bBigger:
2617	zSig = bSig - aSig;
2618	zExp = bExp;
2619	zSign ^= 1;
2620	goto normalizeRoundAndPack;
2621	aExpBigger:
2622	if ( aExp == 0x7FF ) {
2623	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624	return a;
2625	}
2626	if ( bExp == 0 ) {
2627	--expDiff;
2628	}
2629	else {
2630	bSig \|= LIT64( 0x4000000000000000 );
2631	}
2632	shift64RightJamming( bSig, expDiff, &bSig );
2633	aSig \|= LIT64( 0x4000000000000000 );
2634	aBigger:
2635	zSig = aSig - bSig;
2636	zExp = aExp;
2637	normalizeRoundAndPack:
2638	--zExp;
2639	return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2640
2641	}
2642
2643	/*----------------------------------------------------------------------------
2644	\| Returns the result of adding the double-precision floating-point values `a'
2645	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
2646	\| Binary Floating-Point Arithmetic.
2647	----------------------------------------------------------------------------/
2648
2649	float64 float64_add( float64 a, float64 b STATUS_PARAM )
2650	{
2651	flag aSign, bSign;
2652
2653	aSign = extractFloat64Sign( a );
2654	bSign = extractFloat64Sign( b );
2655	if ( aSign == bSign ) {
2656	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2657	}
2658	else {
2659	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2660	}
2661
2662	}
2663
2664	/*----------------------------------------------------------------------------
2665	\| Returns the result of subtracting the double-precision floating-point values
2666	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2667	\| for Binary Floating-Point Arithmetic.
2668	----------------------------------------------------------------------------/
2669
2670	float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2671	{
2672	flag aSign, bSign;
2673
2674	aSign = extractFloat64Sign( a );
2675	bSign = extractFloat64Sign( b );
2676	if ( aSign == bSign ) {
2677	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2678	}
2679	else {
2680	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2681	}
2682
2683	}
2684
2685	/*----------------------------------------------------------------------------
2686	\| Returns the result of multiplying the double-precision floating-point values
2687	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2688	\| for Binary Floating-Point Arithmetic.
2689	----------------------------------------------------------------------------/
2690
2691	float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2692	{
2693	flag aSign, bSign, zSign;
2694	int16 aExp, bExp, zExp;
2695	bits64 aSig, bSig, zSig0, zSig1;
2696
2697	aSig = extractFloat64Frac( a );
2698	aExp = extractFloat64Exp( a );
2699	aSign = extractFloat64Sign( a );
2700	bSig = extractFloat64Frac( b );
2701	bExp = extractFloat64Exp( b );
2702	bSign = extractFloat64Sign( b );
2703	zSign = aSign ^ bSign;
2704	if ( aExp == 0x7FF ) {
2705	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2706	return propagateFloat64NaN( a, b STATUS_VAR );
2707	}
2708	if ( ( bExp \| bSig ) == 0 ) {
2709	float_raise( float_flag_invalid STATUS_VAR);
2710	return float64_default_nan;
2711	}
2712	return packFloat64( zSign, 0x7FF, 0 );
2713	}
2714	if ( bExp == 0x7FF ) {
2715	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2716	if ( ( aExp \| aSig ) == 0 ) {
2717	float_raise( float_flag_invalid STATUS_VAR);
2718	return float64_default_nan;
2719	}
2720	return packFloat64( zSign, 0x7FF, 0 );
2721	}
2722	if ( aExp == 0 ) {
2723	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2724	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2725	}
2726	if ( bExp == 0 ) {
2727	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2728	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2729	}
2730	zExp = aExp + bExp - 0x3FF;
2731	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2732	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2733	mul64To128( aSig, bSig, &zSig0, &zSig1 );
2734	zSig0 \|= ( zSig1 != 0 );
2735	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2736	zSig0 <<= 1;
2737	--zExp;
2738	}
2739	return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2740
2741	}
2742
2743	/*----------------------------------------------------------------------------
2744	\| Returns the result of dividing the double-precision floating-point value `a'
2745	\| by the corresponding value `b'. The operation is performed according to
2746	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2747	----------------------------------------------------------------------------/
2748
2749	float64 float64_div( float64 a, float64 b STATUS_PARAM )
2750	{
2751	flag aSign, bSign, zSign;
2752	int16 aExp, bExp, zExp;
2753	bits64 aSig, bSig, zSig;
2754	bits64 rem0, rem1;
2755	bits64 term0, term1;
2756
2757	aSig = extractFloat64Frac( a );
2758	aExp = extractFloat64Exp( a );
2759	aSign = extractFloat64Sign( a );
2760	bSig = extractFloat64Frac( b );
2761	bExp = extractFloat64Exp( b );
2762	bSign = extractFloat64Sign( b );
2763	zSign = aSign ^ bSign;
2764	if ( aExp == 0x7FF ) {
2765	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2766	if ( bExp == 0x7FF ) {
2767	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2768	float_raise( float_flag_invalid STATUS_VAR);
2769	return float64_default_nan;
2770	}
2771	return packFloat64( zSign, 0x7FF, 0 );
2772	}
2773	if ( bExp == 0x7FF ) {
2774	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775	return packFloat64( zSign, 0, 0 );
2776	}
2777	if ( bExp == 0 ) {
2778	if ( bSig == 0 ) {
2779	if ( ( aExp \| aSig ) == 0 ) {
2780	float_raise( float_flag_invalid STATUS_VAR);
2781	return float64_default_nan;
2782	}
2783	float_raise( float_flag_divbyzero STATUS_VAR);
2784	return packFloat64( zSign, 0x7FF, 0 );
2785	}
2786	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2787	}
2788	if ( aExp == 0 ) {
2789	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2790	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2791	}
2792	zExp = aExp - bExp + 0x3FD;
2793	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2794	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2795	if ( bSig <= ( aSig + aSig ) ) {
2796	aSig >>= 1;
2797	++zExp;
2798	}
2799	zSig = estimateDiv128To64( aSig, 0, bSig );
2800	if ( ( zSig & 0x1FF ) <= 2 ) {
2801	mul64To128( bSig, zSig, &term0, &term1 );
2802	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2803	while ( (sbits64) rem0 < 0 ) {
2804	--zSig;
2805	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2806	}
2807	zSig \|= ( rem1 != 0 );
2808	}
2809	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2810
2811	}
2812
2813	/*----------------------------------------------------------------------------
2814	\| Returns the remainder of the double-precision floating-point value `a'
2815	\| with respect to the corresponding value `b'. The operation is performed
2816	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2817	----------------------------------------------------------------------------/
2818
2819	float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2820	{
2821	flag aSign, bSign, zSign;
2822	int16 aExp, bExp, expDiff;
2823	bits64 aSig, bSig;
2824	bits64 q, alternateASig;
2825	sbits64 sigMean;
2826
2827	aSig = extractFloat64Frac( a );
2828	aExp = extractFloat64Exp( a );
2829	aSign = extractFloat64Sign( a );
2830	bSig = extractFloat64Frac( b );
2831	bExp = extractFloat64Exp( b );
2832	bSign = extractFloat64Sign( b );
2833	if ( aExp == 0x7FF ) {
2834	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2835	return propagateFloat64NaN( a, b STATUS_VAR );
2836	}
2837	float_raise( float_flag_invalid STATUS_VAR);
2838	return float64_default_nan;
2839	}
2840	if ( bExp == 0x7FF ) {
2841	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2842	return a;
2843	}
2844	if ( bExp == 0 ) {
2845	if ( bSig == 0 ) {
2846	float_raise( float_flag_invalid STATUS_VAR);
2847	return float64_default_nan;
2848	}
2849	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2850	}
2851	if ( aExp == 0 ) {
2852	if ( aSig == 0 ) return a;
2853	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854	}
2855	expDiff = aExp - bExp;
2856	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
2857	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2858	if ( expDiff < 0 ) {
2859	if ( expDiff < -1 ) return a;
2860	aSig >>= 1;
2861	}
2862	q = ( bSig <= aSig );
2863	if ( q ) aSig -= bSig;
2864	expDiff -= 64;
2865	while ( 0 < expDiff ) {
2866	q = estimateDiv128To64( aSig, 0, bSig );
2867	q = ( 2 < q ) ? q - 2 : 0;
2868	aSig = - ( ( bSig>>2 ) * q );
2869	expDiff -= 62;
2870	}
2871	expDiff += 64;
2872	if ( 0 < expDiff ) {
2873	q = estimateDiv128To64( aSig, 0, bSig );
2874	q = ( 2 < q ) ? q - 2 : 0;
2875	q >>= 64 - expDiff;
2876	bSig >>= 2;
2877	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2878	}
2879	else {
2880	aSig >>= 2;
2881	bSig >>= 2;
2882	}
2883	do {
2884	alternateASig = aSig;
2885	++q;
2886	aSig -= bSig;
2887	} while ( 0 <= (sbits64) aSig );
2888	sigMean = aSig + alternateASig;
2889	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2890	aSig = alternateASig;
2891	}
2892	zSign = ( (sbits64) aSig < 0 );
2893	if ( zSign ) aSig = - aSig;
2894	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2895
2896	}
2897
2898	/*----------------------------------------------------------------------------
2899	\| Returns the square root of the double-precision floating-point value `a'.
2900	\| The operation is performed according to the IEC/IEEE Standard for Binary
2901	\| Floating-Point Arithmetic.
2902	----------------------------------------------------------------------------/
2903
2904	float64 float64_sqrt( float64 a STATUS_PARAM )
2905	{
2906	flag aSign;
2907	int16 aExp, zExp;
2908	bits64 aSig, zSig, doubleZSig;
2909	bits64 rem0, rem1, term0, term1;
2910
2911	aSig = extractFloat64Frac( a );
2912	aExp = extractFloat64Exp( a );
2913	aSign = extractFloat64Sign( a );
2914	if ( aExp == 0x7FF ) {
2915	if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2916	if ( ! aSign ) return a;
2917	float_raise( float_flag_invalid STATUS_VAR);
2918	return float64_default_nan;
2919	}
2920	if ( aSign ) {
2921	if ( ( aExp \| aSig ) == 0 ) return a;
2922	float_raise( float_flag_invalid STATUS_VAR);
2923	return float64_default_nan;
2924	}
2925	if ( aExp == 0 ) {
2926	if ( aSig == 0 ) return 0;
2927	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2928	}
2929	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2930	aSig \|= LIT64( 0x0010000000000000 );
2931	zSig = estimateSqrt32( aExp, aSig>>21 );
2932	aSig <<= 9 - ( aExp & 1 );
2933	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2934	if ( ( zSig & 0x1FF ) <= 5 ) {
2935	doubleZSig = zSig<<1;
2936	mul64To128( zSig, zSig, &term0, &term1 );
2937	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2938	while ( (sbits64) rem0 < 0 ) {
2939	--zSig;
2940	doubleZSig -= 2;
2941	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
2942	}
2943	zSig \|= ( ( rem0 \| rem1 ) != 0 );
2944	}
2945	return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2946
2947	}
2948
2949	/*----------------------------------------------------------------------------
2950	\| Returns 1 if the double-precision floating-point value `a' is equal to the
2951	\| corresponding value `b', and 0 otherwise. The comparison is performed
2952	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2953	----------------------------------------------------------------------------/
2954
2955	int float64_eq( float64 a, float64 b STATUS_PARAM )
2956	{
2957
2958	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2959	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2960	) {
2961	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
2962	float_raise( float_flag_invalid STATUS_VAR);
2963	}
2964	return 0;
2965	}
2966	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
2967
2968	}
2969
2970	/*----------------------------------------------------------------------------
2971	\| Returns 1 if the double-precision floating-point value `a' is less than or
2972	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
2973	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2974	\| Arithmetic.
2975	----------------------------------------------------------------------------/
2976
2977	int float64_le( float64 a, float64 b STATUS_PARAM )
2978	{
2979	flag aSign, bSign;
2980
2981	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2982	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2983	) {
2984	float_raise( float_flag_invalid STATUS_VAR);
2985	return 0;
2986	}
2987	aSign = extractFloat64Sign( a );
2988	bSign = extractFloat64Sign( b );
2989	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
2990	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
2991
2992	}
2993
2994	/*----------------------------------------------------------------------------
2995	\| Returns 1 if the double-precision floating-point value `a' is less than
2996	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2997	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2998	----------------------------------------------------------------------------/
2999
3000	int float64_lt( float64 a, float64 b STATUS_PARAM )
3001	{
3002	flag aSign, bSign;
3003
3004	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3005	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3006	) {
3007	float_raise( float_flag_invalid STATUS_VAR);
3008	return 0;
3009	}
3010	aSign = extractFloat64Sign( a );
3011	bSign = extractFloat64Sign( b );
3012	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
3013	return ( a != b ) && ( aSign ^ ( a < b ) );
3014
3015	}
3016
3017	/*----------------------------------------------------------------------------
3018	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3019	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
3020	\| if either operand is a NaN. Otherwise, the comparison is performed
3021	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3022	----------------------------------------------------------------------------/
3023
3024	int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3025	{
3026
3027	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3028	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3029	) {
3030	float_raise( float_flag_invalid STATUS_VAR);
3031	return 0;
3032	}
3033	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
3034
3035	}
3036
3037	/*----------------------------------------------------------------------------
3038	\| Returns 1 if the double-precision floating-point value `a' is less than or
3039	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3040	\| cause an exception. Otherwise, the comparison is performed according to the
3041	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3042	----------------------------------------------------------------------------/
3043
3044	int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3045	{
3046	flag aSign, bSign;
3047
3048	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3049	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3050	) {
3051	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3052	float_raise( float_flag_invalid STATUS_VAR);
3053	}
3054	return 0;
3055	}
3056	aSign = extractFloat64Sign( a );
3057	bSign = extractFloat64Sign( b );
3058	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
3059	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
3060
3061	}
3062
3063	/*----------------------------------------------------------------------------
3064	\| Returns 1 if the double-precision floating-point value `a' is less than
3065	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3066	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3067	\| Standard for Binary Floating-Point Arithmetic.
3068	----------------------------------------------------------------------------/
3069
3070	int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3071	{
3072	flag aSign, bSign;
3073
3074	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3075	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3076	) {
3077	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3078	float_raise( float_flag_invalid STATUS_VAR);
3079	}
3080	return 0;
3081	}
3082	aSign = extractFloat64Sign( a );
3083	bSign = extractFloat64Sign( b );
3084	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
3085	return ( a != b ) && ( aSign ^ ( a < b ) );
3086
3087	}
3088
3089	#ifdef FLOATX80
3090
3091	/*----------------------------------------------------------------------------
3092	\| Returns the result of converting the extended double-precision floating-
3093	\| point value `a' to the 32-bit two's complement integer format. The
3094	\| conversion is performed according to the IEC/IEEE Standard for Binary
3095	\| Floating-Point Arithmetic---which means in particular that the conversion
3096	\| is rounded according to the current rounding mode. If `a' is a NaN, the
3097	\| largest positive integer is returned. Otherwise, if the conversion
3098	\| overflows, the largest integer with the same sign as `a' is returned.
3099	----------------------------------------------------------------------------/
3100
3101	int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3102	{
3103	flag aSign;
3104	int32 aExp, shiftCount;
3105	bits64 aSig;
3106
3107	aSig = extractFloatx80Frac( a );
3108	aExp = extractFloatx80Exp( a );
3109	aSign = extractFloatx80Sign( a );
3110	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3111	shiftCount = 0x4037 - aExp;
3112	if ( shiftCount <= 0 ) shiftCount = 1;
3113	shift64RightJamming( aSig, shiftCount, &aSig );
3114	return roundAndPackInt32( aSign, aSig STATUS_VAR );
3115
3116	}
3117
3118	/*----------------------------------------------------------------------------
3119	\| Returns the result of converting the extended double-precision floating-
3120	\| point value `a' to the 32-bit two's complement integer format. The
3121	\| conversion is performed according to the IEC/IEEE Standard for Binary
3122	\| Floating-Point Arithmetic, except that the conversion is always rounded
3123	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3124	\| Otherwise, if the conversion overflows, the largest integer with the same
3125	\| sign as `a' is returned.
3126	----------------------------------------------------------------------------/
3127
3128	int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3129	{
3130	flag aSign;
3131	int32 aExp, shiftCount;
3132	bits64 aSig, savedASig;
3133	int32 z;
3134
3135	aSig = extractFloatx80Frac( a );
3136	aExp = extractFloatx80Exp( a );
3137	aSign = extractFloatx80Sign( a );
3138	if ( 0x401E < aExp ) {
3139	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3140	goto invalid;
3141	}
3142	else if ( aExp < 0x3FFF ) {
3143	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3144	return 0;
3145	}
3146	shiftCount = 0x403E - aExp;
3147	savedASig = aSig;
3148	aSig >>= shiftCount;
3149	z = aSig;
3150	if ( aSign ) z = - z;
3151	if ( ( z < 0 ) ^ aSign ) {
3152	invalid:
3153	float_raise( float_flag_invalid STATUS_VAR);
3154	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3155	}
3156	if ( ( aSig<<shiftCount ) != savedASig ) {
3157	STATUS(float_exception_flags) \|= float_flag_inexact;
3158	}
3159	return z;
3160
3161	}
3162
3163	/*----------------------------------------------------------------------------
3164	\| Returns the result of converting the extended double-precision floating-
3165	\| point value `a' to the 64-bit two's complement integer format. The
3166	\| conversion is performed according to the IEC/IEEE Standard for Binary
3167	\| Floating-Point Arithmetic---which means in particular that the conversion
3168	\| is rounded according to the current rounding mode. If `a' is a NaN,
3169	\| the largest positive integer is returned. Otherwise, if the conversion
3170	\| overflows, the largest integer with the same sign as `a' is returned.
3171	----------------------------------------------------------------------------/
3172
3173	int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3174	{
3175	flag aSign;
3176	int32 aExp, shiftCount;
3177	bits64 aSig, aSigExtra;
3178
3179	aSig = extractFloatx80Frac( a );
3180	aExp = extractFloatx80Exp( a );
3181	aSign = extractFloatx80Sign( a );
3182	shiftCount = 0x403E - aExp;
3183	if ( shiftCount <= 0 ) {
3184	if ( shiftCount ) {
3185	float_raise( float_flag_invalid STATUS_VAR);
3186	if ( ! aSign
3187	\|\| ( ( aExp == 0x7FFF )
3188	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3189	) {
3190	return LIT64( 0x7FFFFFFFFFFFFFFF );
3191	}
3192	return (sbits64) LIT64( 0x8000000000000000 );
3193	}
3194	aSigExtra = 0;
3195	}
3196	else {
3197	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3198	}
3199	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3200
3201	}
3202
3203	/*----------------------------------------------------------------------------
3204	\| Returns the result of converting the extended double-precision floating-
3205	\| point value `a' to the 64-bit two's complement integer format. The
3206	\| conversion is performed according to the IEC/IEEE Standard for Binary
3207	\| Floating-Point Arithmetic, except that the conversion is always rounded
3208	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3209	\| Otherwise, if the conversion overflows, the largest integer with the same
3210	\| sign as `a' is returned.
3211	----------------------------------------------------------------------------/
3212
3213	int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3214	{
3215	flag aSign;
3216	int32 aExp, shiftCount;
3217	bits64 aSig;
3218	int64 z;
3219
3220	aSig = extractFloatx80Frac( a );
3221	aExp = extractFloatx80Exp( a );
3222	aSign = extractFloatx80Sign( a );
3223	shiftCount = aExp - 0x403E;
3224	if ( 0 <= shiftCount ) {
3225	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3226	if ( ( a.high != 0xC03E ) \|\| aSig ) {
3227	float_raise( float_flag_invalid STATUS_VAR);
3228	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
3229	return LIT64( 0x7FFFFFFFFFFFFFFF );
3230	}
3231	}
3232	return (sbits64) LIT64( 0x8000000000000000 );
3233	}
3234	else if ( aExp < 0x3FFF ) {
3235	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3236	return 0;
3237	}
3238	z = aSig>>( - shiftCount );
3239	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3240	STATUS(float_exception_flags) \|= float_flag_inexact;
3241	}
3242	if ( aSign ) z = - z;
3243	return z;
3244
3245	}
3246
3247	/*----------------------------------------------------------------------------
3248	\| Returns the result of converting the extended double-precision floating-
3249	\| point value `a' to the single-precision floating-point format. The
3250	\| conversion is performed according to the IEC/IEEE Standard for Binary
3251	\| Floating-Point Arithmetic.
3252	----------------------------------------------------------------------------/
3253
3254	float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3255	{
3256	flag aSign;
3257	int32 aExp;
3258	bits64 aSig;
3259
3260	aSig = extractFloatx80Frac( a );
3261	aExp = extractFloatx80Exp( a );
3262	aSign = extractFloatx80Sign( a );
3263	if ( aExp == 0x7FFF ) {
3264	if ( (bits64) ( aSig<<1 ) ) {
3265	return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3266	}
3267	return packFloat32( aSign, 0xFF, 0 );
3268	}
3269	shift64RightJamming( aSig, 33, &aSig );
3270	if ( aExp \|\| aSig ) aExp -= 0x3F81;
3271	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3272
3273	}
3274
3275	/*----------------------------------------------------------------------------
3276	\| Returns the result of converting the extended double-precision floating-
3277	\| point value `a' to the double-precision floating-point format. The
3278	\| conversion is performed according to the IEC/IEEE Standard for Binary
3279	\| Floating-Point Arithmetic.
3280	----------------------------------------------------------------------------/
3281
3282	float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3283	{
3284	flag aSign;
3285	int32 aExp;
3286	bits64 aSig, zSig;
3287
3288	aSig = extractFloatx80Frac( a );
3289	aExp = extractFloatx80Exp( a );
3290	aSign = extractFloatx80Sign( a );
3291	if ( aExp == 0x7FFF ) {
3292	if ( (bits64) ( aSig<<1 ) ) {
3293	return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3294	}
3295	return packFloat64( aSign, 0x7FF, 0 );
3296	}
3297	shift64RightJamming( aSig, 1, &zSig );
3298	if ( aExp \|\| aSig ) aExp -= 0x3C01;
3299	return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3300
3301	}
3302
3303	#ifdef FLOAT128
3304
3305	/*----------------------------------------------------------------------------
3306	\| Returns the result of converting the extended double-precision floating-
3307	\| point value `a' to the quadruple-precision floating-point format. The
3308	\| conversion is performed according to the IEC/IEEE Standard for Binary
3309	\| Floating-Point Arithmetic.
3310	----------------------------------------------------------------------------/
3311
3312	float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3313	{
3314	flag aSign;
3315	int16 aExp;
3316	bits64 aSig, zSig0, zSig1;
3317
3318	aSig = extractFloatx80Frac( a );
3319	aExp = extractFloatx80Exp( a );
3320	aSign = extractFloatx80Sign( a );
3321	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3322	return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3323	}
3324	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3325	return packFloat128( aSign, aExp, zSig0, zSig1 );
3326
3327	}
3328
3329	#endif
3330
3331	/*----------------------------------------------------------------------------
3332	\| Rounds the extended double-precision floating-point value `a' to an integer,
3333	\| and returns the result as an extended quadruple-precision floating-point
3334	\| value. The operation is performed according to the IEC/IEEE Standard for
3335	\| Binary Floating-Point Arithmetic.
3336	----------------------------------------------------------------------------/
3337
3338	floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3339	{
3340	flag aSign;
3341	int32 aExp;
3342	bits64 lastBitMask, roundBitsMask;
3343	int8 roundingMode;
3344	floatx80 z;
3345
3346	aExp = extractFloatx80Exp( a );
3347	if ( 0x403E <= aExp ) {
3348	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3349	return propagateFloatx80NaN( a, a STATUS_VAR );
3350	}
3351	return a;
3352	}
3353	if ( aExp < 0x3FFF ) {
3354	if ( ( aExp == 0 )
3355	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3356	return a;
3357	}
3358	STATUS(float_exception_flags) \|= float_flag_inexact;
3359	aSign = extractFloatx80Sign( a );
3360	switch ( STATUS(float_rounding_mode) ) {
3361	case float_round_nearest_even:
3362	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3363	) {
3364	return
3365	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366	}
3367	break;
3368	case float_round_down:
3369	return
3370	aSign ?
3371	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3372	: packFloatx80( 0, 0, 0 );
3373	case float_round_up:
3374	return
3375	aSign ? packFloatx80( 1, 0, 0 )
3376	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3377	}
3378	return packFloatx80( aSign, 0, 0 );
3379	}
3380	lastBitMask = 1;
3381	lastBitMask <<= 0x403E - aExp;
3382	roundBitsMask = lastBitMask - 1;
3383	z = a;
3384	roundingMode = STATUS(float_rounding_mode);
3385	if ( roundingMode == float_round_nearest_even ) {
3386	z.low += lastBitMask>>1;
3387	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3388	}
3389	else if ( roundingMode != float_round_to_zero ) {
3390	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3391	z.low += roundBitsMask;
3392	}
3393	}
3394	z.low &= ~ roundBitsMask;
3395	if ( z.low == 0 ) {
3396	++z.high;
3397	z.low = LIT64( 0x8000000000000000 );
3398	}
3399	if ( z.low != a.low ) STATUS(float_exception_flags) \|= float_flag_inexact;
3400	return z;
3401
3402	}
3403
3404	/*----------------------------------------------------------------------------
3405	\| Returns the result of adding the absolute values of the extended double-
3406	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3407	\| negated before being returned. `zSign' is ignored if the result is a NaN.
3408	\| The addition is performed according to the IEC/IEEE Standard for Binary
3409	\| Floating-Point Arithmetic.
3410	----------------------------------------------------------------------------/
3411
3412	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3413	{
3414	int32 aExp, bExp, zExp;
3415	bits64 aSig, bSig, zSig0, zSig1;
3416	int32 expDiff;
3417
3418	aSig = extractFloatx80Frac( a );
3419	aExp = extractFloatx80Exp( a );
3420	bSig = extractFloatx80Frac( b );
3421	bExp = extractFloatx80Exp( b );
3422	expDiff = aExp - bExp;
3423	if ( 0 < expDiff ) {
3424	if ( aExp == 0x7FFF ) {
3425	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3426	return a;
3427	}
3428	if ( bExp == 0 ) --expDiff;
3429	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3430	zExp = aExp;
3431	}
3432	else if ( expDiff < 0 ) {
3433	if ( bExp == 0x7FFF ) {
3434	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3435	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3436	}
3437	if ( aExp == 0 ) ++expDiff;
3438	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3439	zExp = bExp;
3440	}
3441	else {
3442	if ( aExp == 0x7FFF ) {
3443	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3444	return propagateFloatx80NaN( a, b STATUS_VAR );
3445	}
3446	return a;
3447	}
3448	zSig1 = 0;
3449	zSig0 = aSig + bSig;
3450	if ( aExp == 0 ) {
3451	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3452	goto roundAndPack;
3453	}
3454	zExp = aExp;
3455	goto shiftRight1;
3456	}
3457	zSig0 = aSig + bSig;
3458	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3459	shiftRight1:
3460	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3461	zSig0 \|= LIT64( 0x8000000000000000 );
3462	++zExp;
3463	roundAndPack:
3464	return
3465	roundAndPackFloatx80(
3466	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3467
3468	}
3469
3470	/*----------------------------------------------------------------------------
3471	\| Returns the result of subtracting the absolute values of the extended
3472	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3473	\| difference is negated before being returned. `zSign' is ignored if the
3474	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
3475	\| Standard for Binary Floating-Point Arithmetic.
3476	----------------------------------------------------------------------------/
3477
3478	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3479	{
3480	int32 aExp, bExp, zExp;
3481	bits64 aSig, bSig, zSig0, zSig1;
3482	int32 expDiff;
3483	floatx80 z;
3484
3485	aSig = extractFloatx80Frac( a );
3486	aExp = extractFloatx80Exp( a );
3487	bSig = extractFloatx80Frac( b );
3488	bExp = extractFloatx80Exp( b );
3489	expDiff = aExp - bExp;
3490	if ( 0 < expDiff ) goto aExpBigger;
3491	if ( expDiff < 0 ) goto bExpBigger;
3492	if ( aExp == 0x7FFF ) {
3493	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3494	return propagateFloatx80NaN( a, b STATUS_VAR );
3495	}
3496	float_raise( float_flag_invalid STATUS_VAR);
3497	z.low = floatx80_default_nan_low;
3498	z.high = floatx80_default_nan_high;
3499	return z;
3500	}
3501	if ( aExp == 0 ) {
3502	aExp = 1;
3503	bExp = 1;
3504	}
3505	zSig1 = 0;
3506	if ( bSig < aSig ) goto aBigger;
3507	if ( aSig < bSig ) goto bBigger;
3508	return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3509	bExpBigger:
3510	if ( bExp == 0x7FFF ) {
3511	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3513	}
3514	if ( aExp == 0 ) ++expDiff;
3515	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516	bBigger:
3517	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3518	zExp = bExp;
3519	zSign ^= 1;
3520	goto normalizeRoundAndPack;
3521	aExpBigger:
3522	if ( aExp == 0x7FFF ) {
3523	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3524	return a;
3525	}
3526	if ( bExp == 0 ) --expDiff;
3527	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3528	aBigger:
3529	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3530	zExp = aExp;
3531	normalizeRoundAndPack:
3532	return
3533	normalizeRoundAndPackFloatx80(
3534	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3535
3536	}
3537
3538	/*----------------------------------------------------------------------------
3539	\| Returns the result of adding the extended double-precision floating-point
3540	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
3541	\| Standard for Binary Floating-Point Arithmetic.
3542	----------------------------------------------------------------------------/
3543
3544	floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3545	{
3546	flag aSign, bSign;
3547
3548	aSign = extractFloatx80Sign( a );
3549	bSign = extractFloatx80Sign( b );
3550	if ( aSign == bSign ) {
3551	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3552	}
3553	else {
3554	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3555	}
3556
3557	}
3558
3559	/*----------------------------------------------------------------------------
3560	\| Returns the result of subtracting the extended double-precision floating-
3561	\| point values `a' and `b'. The operation is performed according to the
3562	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3563	----------------------------------------------------------------------------/
3564
3565	floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3566	{
3567	flag aSign, bSign;
3568
3569	aSign = extractFloatx80Sign( a );
3570	bSign = extractFloatx80Sign( b );
3571	if ( aSign == bSign ) {
3572	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3573	}
3574	else {
3575	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3576	}
3577
3578	}
3579
3580	/*----------------------------------------------------------------------------
3581	\| Returns the result of multiplying the extended double-precision floating-
3582	\| point values `a' and `b'. The operation is performed according to the
3583	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3584	----------------------------------------------------------------------------/
3585
3586	floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3587	{
3588	flag aSign, bSign, zSign;
3589	int32 aExp, bExp, zExp;
3590	bits64 aSig, bSig, zSig0, zSig1;
3591	floatx80 z;
3592
3593	aSig = extractFloatx80Frac( a );
3594	aExp = extractFloatx80Exp( a );
3595	aSign = extractFloatx80Sign( a );
3596	bSig = extractFloatx80Frac( b );
3597	bExp = extractFloatx80Exp( b );
3598	bSign = extractFloatx80Sign( b );
3599	zSign = aSign ^ bSign;
3600	if ( aExp == 0x7FFF ) {
3601	if ( (bits64) ( aSig<<1 )
3602	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3603	return propagateFloatx80NaN( a, b STATUS_VAR );
3604	}
3605	if ( ( bExp \| bSig ) == 0 ) goto invalid;
3606	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607	}
3608	if ( bExp == 0x7FFF ) {
3609	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3610	if ( ( aExp \| aSig ) == 0 ) {
3611	invalid:
3612	float_raise( float_flag_invalid STATUS_VAR);
3613	z.low = floatx80_default_nan_low;
3614	z.high = floatx80_default_nan_high;
3615	return z;
3616	}
3617	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3618	}
3619	if ( aExp == 0 ) {
3620	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3621	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3622	}
3623	if ( bExp == 0 ) {
3624	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3625	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3626	}
3627	zExp = aExp + bExp - 0x3FFE;
3628	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3629	if ( 0 < (sbits64) zSig0 ) {
3630	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631	--zExp;
3632	}
3633	return
3634	roundAndPackFloatx80(
3635	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3636
3637	}
3638
3639	/*----------------------------------------------------------------------------
3640	\| Returns the result of dividing the extended double-precision floating-point
3641	\| value `a' by the corresponding value `b'. The operation is performed
3642	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3643	----------------------------------------------------------------------------/
3644
3645	floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3646	{
3647	flag aSign, bSign, zSign;
3648	int32 aExp, bExp, zExp;
3649	bits64 aSig, bSig, zSig0, zSig1;
3650	bits64 rem0, rem1, rem2, term0, term1, term2;
3651	floatx80 z;
3652
3653	aSig = extractFloatx80Frac( a );
3654	aExp = extractFloatx80Exp( a );
3655	aSign = extractFloatx80Sign( a );
3656	bSig = extractFloatx80Frac( b );
3657	bExp = extractFloatx80Exp( b );
3658	bSign = extractFloatx80Sign( b );
3659	zSign = aSign ^ bSign;
3660	if ( aExp == 0x7FFF ) {
3661	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3662	if ( bExp == 0x7FFF ) {
3663	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3664	goto invalid;
3665	}
3666	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3667	}
3668	if ( bExp == 0x7FFF ) {
3669	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3670	return packFloatx80( zSign, 0, 0 );
3671	}
3672	if ( bExp == 0 ) {
3673	if ( bSig == 0 ) {
3674	if ( ( aExp \| aSig ) == 0 ) {
3675	invalid:
3676	float_raise( float_flag_invalid STATUS_VAR);
3677	z.low = floatx80_default_nan_low;
3678	z.high = floatx80_default_nan_high;
3679	return z;
3680	}
3681	float_raise( float_flag_divbyzero STATUS_VAR);
3682	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683	}
3684	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3685	}
3686	if ( aExp == 0 ) {
3687	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3688	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3689	}
3690	zExp = aExp - bExp + 0x3FFE;
3691	rem1 = 0;
3692	if ( bSig <= aSig ) {
3693	shift128Right( aSig, 0, 1, &aSig, &rem1 );
3694	++zExp;
3695	}
3696	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3697	mul64To128( bSig, zSig0, &term0, &term1 );
3698	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3699	while ( (sbits64) rem0 < 0 ) {
3700	--zSig0;
3701	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3702	}
3703	zSig1 = estimateDiv128To64( rem1, 0, bSig );
3704	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3705	mul64To128( bSig, zSig1, &term1, &term2 );
3706	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3707	while ( (sbits64) rem1 < 0 ) {
3708	--zSig1;
3709	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3710	}
3711	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
3712	}
3713	return
3714	roundAndPackFloatx80(
3715	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3716
3717	}
3718
3719	/*----------------------------------------------------------------------------
3720	\| Returns the remainder of the extended double-precision floating-point value
3721	\| `a' with respect to the corresponding value `b'. The operation is performed
3722	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723	----------------------------------------------------------------------------/
3724
3725	floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3726	{
3727	flag aSign, bSign, zSign;
3728	int32 aExp, bExp, expDiff;
3729	bits64 aSig0, aSig1, bSig;
3730	bits64 q, term0, term1, alternateASig0, alternateASig1;
3731	floatx80 z;
3732
3733	aSig0 = extractFloatx80Frac( a );
3734	aExp = extractFloatx80Exp( a );
3735	aSign = extractFloatx80Sign( a );
3736	bSig = extractFloatx80Frac( b );
3737	bExp = extractFloatx80Exp( b );
3738	bSign = extractFloatx80Sign( b );
3739	if ( aExp == 0x7FFF ) {
3740	if ( (bits64) ( aSig0<<1 )
3741	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3742	return propagateFloatx80NaN( a, b STATUS_VAR );
3743	}
3744	goto invalid;
3745	}
3746	if ( bExp == 0x7FFF ) {
3747	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3748	return a;
3749	}
3750	if ( bExp == 0 ) {
3751	if ( bSig == 0 ) {
3752	invalid:
3753	float_raise( float_flag_invalid STATUS_VAR);
3754	z.low = floatx80_default_nan_low;
3755	z.high = floatx80_default_nan_high;
3756	return z;
3757	}
3758	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759	}
3760	if ( aExp == 0 ) {
3761	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3762	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3763	}
3764	bSig \|= LIT64( 0x8000000000000000 );
3765	zSign = aSign;
3766	expDiff = aExp - bExp;
3767	aSig1 = 0;
3768	if ( expDiff < 0 ) {
3769	if ( expDiff < -1 ) return a;
3770	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3771	expDiff = 0;
3772	}
3773	q = ( bSig <= aSig0 );
3774	if ( q ) aSig0 -= bSig;
3775	expDiff -= 64;
3776	while ( 0 < expDiff ) {
3777	q = estimateDiv128To64( aSig0, aSig1, bSig );
3778	q = ( 2 < q ) ? q - 2 : 0;
3779	mul64To128( bSig, q, &term0, &term1 );
3780	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3781	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3782	expDiff -= 62;
3783	}
3784	expDiff += 64;
3785	if ( 0 < expDiff ) {
3786	q = estimateDiv128To64( aSig0, aSig1, bSig );
3787	q = ( 2 < q ) ? q - 2 : 0;
3788	q >>= 64 - expDiff;
3789	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3790	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3791	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3792	while ( le128( term0, term1, aSig0, aSig1 ) ) {
3793	++q;
3794	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3795	}
3796	}
3797	else {
3798	term1 = 0;
3799	term0 = bSig;
3800	}
3801	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3802	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3803	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3804	&& ( q & 1 ) )
3805	) {
3806	aSig0 = alternateASig0;
3807	aSig1 = alternateASig1;
3808	zSign = ! zSign;
3809	}
3810	return
3811	normalizeRoundAndPackFloatx80(
3812	80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3813
3814	}
3815
3816	/*----------------------------------------------------------------------------
3817	\| Returns the square root of the extended double-precision floating-point
3818	\| value `a'. The operation is performed according to the IEC/IEEE Standard
3819	\| for Binary Floating-Point Arithmetic.
3820	----------------------------------------------------------------------------/
3821
3822	floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3823	{
3824	flag aSign;
3825	int32 aExp, zExp;
3826	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3827	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3828	floatx80 z;
3829
3830	aSig0 = extractFloatx80Frac( a );
3831	aExp = extractFloatx80Exp( a );
3832	aSign = extractFloatx80Sign( a );
3833	if ( aExp == 0x7FFF ) {
3834	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3835	if ( ! aSign ) return a;
3836	goto invalid;
3837	}
3838	if ( aSign ) {
3839	if ( ( aExp \| aSig0 ) == 0 ) return a;
3840	invalid:
3841	float_raise( float_flag_invalid STATUS_VAR);
3842	z.low = floatx80_default_nan_low;
3843	z.high = floatx80_default_nan_high;
3844	return z;
3845	}
3846	if ( aExp == 0 ) {
3847	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3848	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3849	}
3850	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3851	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3852	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3853	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3854	doubleZSig0 = zSig0<<1;
3855	mul64To128( zSig0, zSig0, &term0, &term1 );
3856	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3857	while ( (sbits64) rem0 < 0 ) {
3858	--zSig0;
3859	doubleZSig0 -= 2;
3860	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
3861	}
3862	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3863	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3864	if ( zSig1 == 0 ) zSig1 = 1;
3865	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3866	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867	mul64To128( zSig1, zSig1, &term2, &term3 );
3868	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3869	while ( (sbits64) rem1 < 0 ) {
3870	--zSig1;
3871	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3872	term3 \|= 1;
3873	term2 \|= doubleZSig0;
3874	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3875	}
3876	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
3877	}
3878	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3879	zSig0 \|= doubleZSig0;
3880	return
3881	roundAndPackFloatx80(
3882	STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3883
3884	}
3885
3886	/*----------------------------------------------------------------------------
3887	\| Returns 1 if the extended double-precision floating-point value `a' is
3888	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3889	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3890	\| Arithmetic.
3891	----------------------------------------------------------------------------/
3892
3893	int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3894	{
3895
3896	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3897	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3898	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3899	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3900	) {
3901	if ( floatx80_is_signaling_nan( a )
3902	\|\| floatx80_is_signaling_nan( b ) ) {
3903	float_raise( float_flag_invalid STATUS_VAR);
3904	}
3905	return 0;
3906	}
3907	return
3908	( a.low == b.low )
3909	&& ( ( a.high == b.high )
3910	\|\| ( ( a.low == 0 )
3911	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
3912	);
3913
3914	}
3915
3916	/*----------------------------------------------------------------------------
3917	\| Returns 1 if the extended double-precision floating-point value `a' is
3918	\| less than or equal to the corresponding value `b', and 0 otherwise. The
3919	\| comparison is performed according to the IEC/IEEE Standard for Binary
3920	\| Floating-Point Arithmetic.
3921	----------------------------------------------------------------------------/
3922
3923	int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3924	{
3925	flag aSign, bSign;
3926
3927	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3928	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3929	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3930	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3931	) {
3932	float_raise( float_flag_invalid STATUS_VAR);
3933	return 0;
3934	}
3935	aSign = extractFloatx80Sign( a );
3936	bSign = extractFloatx80Sign( b );
3937	if ( aSign != bSign ) {
3938	return
3939	aSign
3940	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
3941	== 0 );
3942	}
3943	return
3944	aSign ? le128( b.high, b.low, a.high, a.low )
3945	: le128( a.high, a.low, b.high, b.low );
3946
3947	}
3948
3949	/*----------------------------------------------------------------------------
3950	\| Returns 1 if the extended double-precision floating-point value `a' is
3951	\| less than the corresponding value `b', and 0 otherwise. The comparison
3952	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3953	\| Arithmetic.
3954	----------------------------------------------------------------------------/
3955
3956	int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3957	{
3958	flag aSign, bSign;
3959
3960	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3961	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3962	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3963	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3964	) {
3965	float_raise( float_flag_invalid STATUS_VAR);
3966	return 0;
3967	}
3968	aSign = extractFloatx80Sign( a );
3969	bSign = extractFloatx80Sign( b );
3970	if ( aSign != bSign ) {
3971	return
3972	aSign
3973	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
3974	!= 0 );
3975	}
3976	return
3977	aSign ? lt128( b.high, b.low, a.high, a.low )
3978	: lt128( a.high, a.low, b.high, b.low );
3979
3980	}
3981
3982	/*----------------------------------------------------------------------------
3983	\| Returns 1 if the extended double-precision floating-point value `a' is equal
3984	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
3985	\| raised if either operand is a NaN. Otherwise, the comparison is performed
3986	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3987	----------------------------------------------------------------------------/
3988
3989	int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3990	{
3991
3992	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3993	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3995	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996	) {
3997	float_raise( float_flag_invalid STATUS_VAR);
3998	return 0;
3999	}
4000	return
4001	( a.low == b.low )
4002	&& ( ( a.high == b.high )
4003	\|\| ( ( a.low == 0 )
4004	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4005	);
4006
4007	}
4008
4009	/*----------------------------------------------------------------------------
4010	\| Returns 1 if the extended double-precision floating-point value `a' is less
4011	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4012	\| do not cause an exception. Otherwise, the comparison is performed according
4013	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4014	----------------------------------------------------------------------------/
4015
4016	int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4017	{
4018	flag aSign, bSign;
4019
4020	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4021	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4023	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024	) {
4025	if ( floatx80_is_signaling_nan( a )
4026	\|\| floatx80_is_signaling_nan( b ) ) {
4027	float_raise( float_flag_invalid STATUS_VAR);
4028	}
4029	return 0;
4030	}
4031	aSign = extractFloatx80Sign( a );
4032	bSign = extractFloatx80Sign( b );
4033	if ( aSign != bSign ) {
4034	return
4035	aSign
4036	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4037	== 0 );
4038	}
4039	return
4040	aSign ? le128( b.high, b.low, a.high, a.low )
4041	: le128( a.high, a.low, b.high, b.low );
4042
4043	}
4044
4045	/*----------------------------------------------------------------------------
4046	\| Returns 1 if the extended double-precision floating-point value `a' is less
4047	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4048	\| an exception. Otherwise, the comparison is performed according to the
4049	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4050	----------------------------------------------------------------------------/
4051
4052	int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4053	{
4054	flag aSign, bSign;
4055
4056	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4057	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4058	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4059	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4060	) {
4061	if ( floatx80_is_signaling_nan( a )
4062	\|\| floatx80_is_signaling_nan( b ) ) {
4063	float_raise( float_flag_invalid STATUS_VAR);
4064	}
4065	return 0;
4066	}
4067	aSign = extractFloatx80Sign( a );
4068	bSign = extractFloatx80Sign( b );
4069	if ( aSign != bSign ) {
4070	return
4071	aSign
4072	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4073	!= 0 );
4074	}
4075	return
4076	aSign ? lt128( b.high, b.low, a.high, a.low )
4077	: lt128( a.high, a.low, b.high, b.low );
4078
4079	}
4080
4081	#endif
4082
4083	#ifdef FLOAT128
4084
4085	/*----------------------------------------------------------------------------
4086	\| Returns the result of converting the quadruple-precision floating-point
4087	\| value `a' to the 32-bit two's complement integer format. The conversion
4088	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4089	\| Arithmetic---which means in particular that the conversion is rounded
4090	\| according to the current rounding mode. If `a' is a NaN, the largest
4091	\| positive integer is returned. Otherwise, if the conversion overflows, the
4092	\| largest integer with the same sign as `a' is returned.
4093	----------------------------------------------------------------------------/
4094
4095	int32 float128_to_int32( float128 a STATUS_PARAM )
4096	{
4097	flag aSign;
4098	int32 aExp, shiftCount;
4099	bits64 aSig0, aSig1;
4100
4101	aSig1 = extractFloat128Frac1( a );
4102	aSig0 = extractFloat128Frac0( a );
4103	aExp = extractFloat128Exp( a );
4104	aSign = extractFloat128Sign( a );
4105	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
4106	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4107	aSig0 \|= ( aSig1 != 0 );
4108	shiftCount = 0x4028 - aExp;
4109	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4110	return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4111
4112	}
4113
4114	/*----------------------------------------------------------------------------
4115	\| Returns the result of converting the quadruple-precision floating-point
4116	\| value `a' to the 32-bit two's complement integer format. The conversion
4117	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118	\| Arithmetic, except that the conversion is always rounded toward zero. If
4119	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4120	\| conversion overflows, the largest integer with the same sign as `a' is
4121	\| returned.
4122	----------------------------------------------------------------------------/
4123
4124	int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4125	{
4126	flag aSign;
4127	int32 aExp, shiftCount;
4128	bits64 aSig0, aSig1, savedASig;
4129	int32 z;
4130
4131	aSig1 = extractFloat128Frac1( a );
4132	aSig0 = extractFloat128Frac0( a );
4133	aExp = extractFloat128Exp( a );
4134	aSign = extractFloat128Sign( a );
4135	aSig0 \|= ( aSig1 != 0 );
4136	if ( 0x401E < aExp ) {
4137	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4138	goto invalid;
4139	}
4140	else if ( aExp < 0x3FFF ) {
4141	if ( aExp \|\| aSig0 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4142	return 0;
4143	}
4144	aSig0 \|= LIT64( 0x0001000000000000 );
4145	shiftCount = 0x402F - aExp;
4146	savedASig = aSig0;
4147	aSig0 >>= shiftCount;
4148	z = aSig0;
4149	if ( aSign ) z = - z;
4150	if ( ( z < 0 ) ^ aSign ) {
4151	invalid:
4152	float_raise( float_flag_invalid STATUS_VAR);
4153	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4154	}
4155	if ( ( aSig0<<shiftCount ) != savedASig ) {
4156	STATUS(float_exception_flags) \|= float_flag_inexact;
4157	}
4158	return z;
4159
4160	}
4161
4162	/*----------------------------------------------------------------------------
4163	\| Returns the result of converting the quadruple-precision floating-point
4164	\| value `a' to the 64-bit two's complement integer format. The conversion
4165	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166	\| Arithmetic---which means in particular that the conversion is rounded
4167	\| according to the current rounding mode. If `a' is a NaN, the largest
4168	\| positive integer is returned. Otherwise, if the conversion overflows, the
4169	\| largest integer with the same sign as `a' is returned.
4170	----------------------------------------------------------------------------/
4171
4172	int64 float128_to_int64( float128 a STATUS_PARAM )
4173	{
4174	flag aSign;
4175	int32 aExp, shiftCount;
4176	bits64 aSig0, aSig1;
4177
4178	aSig1 = extractFloat128Frac1( a );
4179	aSig0 = extractFloat128Frac0( a );
4180	aExp = extractFloat128Exp( a );
4181	aSign = extractFloat128Sign( a );
4182	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4183	shiftCount = 0x402F - aExp;
4184	if ( shiftCount <= 0 ) {
4185	if ( 0x403E < aExp ) {
4186	float_raise( float_flag_invalid STATUS_VAR);
4187	if ( ! aSign
4188	\|\| ( ( aExp == 0x7FFF )
4189	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4190	)
4191	) {
4192	return LIT64( 0x7FFFFFFFFFFFFFFF );
4193	}
4194	return (sbits64) LIT64( 0x8000000000000000 );
4195	}
4196	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4197	}
4198	else {
4199	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4200	}
4201	return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4202
4203	}
4204
4205	/*----------------------------------------------------------------------------
4206	\| Returns the result of converting the quadruple-precision floating-point
4207	\| value `a' to the 64-bit two's complement integer format. The conversion
4208	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4209	\| Arithmetic, except that the conversion is always rounded toward zero.
4210	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4211	\| the conversion overflows, the largest integer with the same sign as `a' is
4212	\| returned.
4213	----------------------------------------------------------------------------/
4214
4215	int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4216	{
4217	flag aSign;
4218	int32 aExp, shiftCount;
4219	bits64 aSig0, aSig1;
4220	int64 z;
4221
4222	aSig1 = extractFloat128Frac1( a );
4223	aSig0 = extractFloat128Frac0( a );
4224	aExp = extractFloat128Exp( a );
4225	aSign = extractFloat128Sign( a );
4226	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4227	shiftCount = aExp - 0x402F;
4228	if ( 0 < shiftCount ) {
4229	if ( 0x403E <= aExp ) {
4230	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4231	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4232	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4233	if ( aSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4234	}
4235	else {
4236	float_raise( float_flag_invalid STATUS_VAR);
4237	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
4238	return LIT64( 0x7FFFFFFFFFFFFFFF );
4239	}
4240	}
4241	return (sbits64) LIT64( 0x8000000000000000 );
4242	}
4243	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
4244	if ( (bits64) ( aSig1<<shiftCount ) ) {
4245	STATUS(float_exception_flags) \|= float_flag_inexact;
4246	}
4247	}
4248	else {
4249	if ( aExp < 0x3FFF ) {
4250	if ( aExp \| aSig0 \| aSig1 ) {
4251	STATUS(float_exception_flags) \|= float_flag_inexact;
4252	}
4253	return 0;
4254	}
4255	z = aSig0>>( - shiftCount );
4256	if ( aSig1
4257	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4258	STATUS(float_exception_flags) \|= float_flag_inexact;
4259	}
4260	}
4261	if ( aSign ) z = - z;
4262	return z;
4263
4264	}
4265
4266	/*----------------------------------------------------------------------------
4267	\| Returns the result of converting the quadruple-precision floating-point
4268	\| value `a' to the single-precision floating-point format. The conversion
4269	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4270	\| Arithmetic.
4271	----------------------------------------------------------------------------/
4272
4273	float32 float128_to_float32( float128 a STATUS_PARAM )
4274	{
4275	flag aSign;
4276	int32 aExp;
4277	bits64 aSig0, aSig1;
4278	bits32 zSig;
4279
4280	aSig1 = extractFloat128Frac1( a );
4281	aSig0 = extractFloat128Frac0( a );
4282	aExp = extractFloat128Exp( a );
4283	aSign = extractFloat128Sign( a );
4284	if ( aExp == 0x7FFF ) {
4285	if ( aSig0 \| aSig1 ) {
4286	return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4287	}
4288	return packFloat32( aSign, 0xFF, 0 );
4289	}
4290	aSig0 \|= ( aSig1 != 0 );
4291	shift64RightJamming( aSig0, 18, &aSig0 );
4292	zSig = aSig0;
4293	if ( aExp \|\| zSig ) {
4294	zSig \|= 0x40000000;
4295	aExp -= 0x3F81;
4296	}
4297	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4298
4299	}
4300
4301	/*----------------------------------------------------------------------------
4302	\| Returns the result of converting the quadruple-precision floating-point
4303	\| value `a' to the double-precision floating-point format. The conversion
4304	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305	\| Arithmetic.
4306	----------------------------------------------------------------------------/
4307
4308	float64 float128_to_float64( float128 a STATUS_PARAM )
4309	{
4310	flag aSign;
4311	int32 aExp;
4312	bits64 aSig0, aSig1;
4313
4314	aSig1 = extractFloat128Frac1( a );
4315	aSig0 = extractFloat128Frac0( a );
4316	aExp = extractFloat128Exp( a );
4317	aSign = extractFloat128Sign( a );
4318	if ( aExp == 0x7FFF ) {
4319	if ( aSig0 \| aSig1 ) {
4320	return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4321	}
4322	return packFloat64( aSign, 0x7FF, 0 );
4323	}
4324	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4325	aSig0 \|= ( aSig1 != 0 );
4326	if ( aExp \|\| aSig0 ) {
4327	aSig0 \|= LIT64( 0x4000000000000000 );
4328	aExp -= 0x3C01;
4329	}
4330	return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4331
4332	}
4333
4334	#ifdef FLOATX80
4335
4336	/*----------------------------------------------------------------------------
4337	\| Returns the result of converting the quadruple-precision floating-point
4338	\| value `a' to the extended double-precision floating-point format. The
4339	\| conversion is performed according to the IEC/IEEE Standard for Binary
4340	\| Floating-Point Arithmetic.
4341	----------------------------------------------------------------------------/
4342
4343	floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4344	{
4345	flag aSign;
4346	int32 aExp;
4347	bits64 aSig0, aSig1;
4348
4349	aSig1 = extractFloat128Frac1( a );
4350	aSig0 = extractFloat128Frac0( a );
4351	aExp = extractFloat128Exp( a );
4352	aSign = extractFloat128Sign( a );
4353	if ( aExp == 0x7FFF ) {
4354	if ( aSig0 \| aSig1 ) {
4355	return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4356	}
4357	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4358	}
4359	if ( aExp == 0 ) {
4360	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4361	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4362	}
4363	else {
4364	aSig0 \|= LIT64( 0x0001000000000000 );
4365	}
4366	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4367	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4368
4369	}
4370
4371	#endif
4372
4373	/*----------------------------------------------------------------------------
4374	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
4375	\| returns the result as a quadruple-precision floating-point value. The
4376	\| operation is performed according to the IEC/IEEE Standard for Binary
4377	\| Floating-Point Arithmetic.
4378	----------------------------------------------------------------------------/
4379
4380	float128 float128_round_to_int( float128 a STATUS_PARAM )
4381	{
4382	flag aSign;
4383	int32 aExp;
4384	bits64 lastBitMask, roundBitsMask;
4385	int8 roundingMode;
4386	float128 z;
4387
4388	aExp = extractFloat128Exp( a );
4389	if ( 0x402F <= aExp ) {
4390	if ( 0x406F <= aExp ) {
4391	if ( ( aExp == 0x7FFF )
4392	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
4393	) {
4394	return propagateFloat128NaN( a, a STATUS_VAR );
4395	}
4396	return a;
4397	}
4398	lastBitMask = 1;
4399	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4400	roundBitsMask = lastBitMask - 1;
4401	z = a;
4402	roundingMode = STATUS(float_rounding_mode);
4403	if ( roundingMode == float_round_nearest_even ) {
4404	if ( lastBitMask ) {
4405	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4406	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4407	}
4408	else {
4409	if ( (sbits64) z.low < 0 ) {
4410	++z.high;
4411	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4412	}
4413	}
4414	}
4415	else if ( roundingMode != float_round_to_zero ) {
4416	if ( extractFloat128Sign( z )
4417	^ ( roundingMode == float_round_up ) ) {
4418	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4419	}
4420	}
4421	z.low &= ~ roundBitsMask;
4422	}
4423	else {
4424	if ( aExp < 0x3FFF ) {
4425	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
4426	STATUS(float_exception_flags) \|= float_flag_inexact;
4427	aSign = extractFloat128Sign( a );
4428	switch ( STATUS(float_rounding_mode) ) {
4429	case float_round_nearest_even:
4430	if ( ( aExp == 0x3FFE )
4431	&& ( extractFloat128Frac0( a )
4432	\| extractFloat128Frac1( a ) )
4433	) {
4434	return packFloat128( aSign, 0x3FFF, 0, 0 );
4435	}
4436	break;
4437	case float_round_down:
4438	return
4439	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4440	: packFloat128( 0, 0, 0, 0 );
4441	case float_round_up:
4442	return
4443	aSign ? packFloat128( 1, 0, 0, 0 )
4444	: packFloat128( 0, 0x3FFF, 0, 0 );
4445	}
4446	return packFloat128( aSign, 0, 0, 0 );
4447	}
4448	lastBitMask = 1;
4449	lastBitMask <<= 0x402F - aExp;
4450	roundBitsMask = lastBitMask - 1;
4451	z.low = 0;
4452	z.high = a.high;
4453	roundingMode = STATUS(float_rounding_mode);
4454	if ( roundingMode == float_round_nearest_even ) {
4455	z.high += lastBitMask>>1;
4456	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
4457	z.high &= ~ lastBitMask;
4458	}
4459	}
4460	else if ( roundingMode != float_round_to_zero ) {
4461	if ( extractFloat128Sign( z )
4462	^ ( roundingMode == float_round_up ) ) {
4463	z.high \|= ( a.low != 0 );
4464	z.high += roundBitsMask;
4465	}
4466	}
4467	z.high &= ~ roundBitsMask;
4468	}
4469	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
4470	STATUS(float_exception_flags) \|= float_flag_inexact;
4471	}
4472	return z;
4473
4474	}
4475
4476	/*----------------------------------------------------------------------------
4477	\| Returns the result of adding the absolute values of the quadruple-precision
4478	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4479	\| before being returned. `zSign' is ignored if the result is a NaN.
4480	\| The addition is performed according to the IEC/IEEE Standard for Binary
4481	\| Floating-Point Arithmetic.
4482	----------------------------------------------------------------------------/
4483
4484	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4485	{
4486	int32 aExp, bExp, zExp;
4487	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4488	int32 expDiff;
4489
4490	aSig1 = extractFloat128Frac1( a );
4491	aSig0 = extractFloat128Frac0( a );
4492	aExp = extractFloat128Exp( a );
4493	bSig1 = extractFloat128Frac1( b );
4494	bSig0 = extractFloat128Frac0( b );
4495	bExp = extractFloat128Exp( b );
4496	expDiff = aExp - bExp;
4497	if ( 0 < expDiff ) {
4498	if ( aExp == 0x7FFF ) {
4499	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4500	return a;
4501	}
4502	if ( bExp == 0 ) {
4503	--expDiff;
4504	}
4505	else {
4506	bSig0 \|= LIT64( 0x0001000000000000 );
4507	}
4508	shift128ExtraRightJamming(
4509	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4510	zExp = aExp;
4511	}
4512	else if ( expDiff < 0 ) {
4513	if ( bExp == 0x7FFF ) {
4514	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4515	return packFloat128( zSign, 0x7FFF, 0, 0 );
4516	}
4517	if ( aExp == 0 ) {
4518	++expDiff;
4519	}
4520	else {
4521	aSig0 \|= LIT64( 0x0001000000000000 );
4522	}
4523	shift128ExtraRightJamming(
4524	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4525	zExp = bExp;
4526	}
4527	else {
4528	if ( aExp == 0x7FFF ) {
4529	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4530	return propagateFloat128NaN( a, b STATUS_VAR );
4531	}
4532	return a;
4533	}
4534	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4535	if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4536	zSig2 = 0;
4537	zSig0 \|= LIT64( 0x0002000000000000 );
4538	zExp = aExp;
4539	goto shiftRight1;
4540	}
4541	aSig0 \|= LIT64( 0x0001000000000000 );
4542	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4543	--zExp;
4544	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4545	++zExp;
4546	shiftRight1:
4547	shift128ExtraRightJamming(
4548	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4549	roundAndPack:
4550	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4551
4552	}
4553
4554	/*----------------------------------------------------------------------------
4555	\| Returns the result of subtracting the absolute values of the quadruple-
4556	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
4557	\| difference is negated before being returned. `zSign' is ignored if the
4558	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
4559	\| Standard for Binary Floating-Point Arithmetic.
4560	----------------------------------------------------------------------------/
4561
4562	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4563	{
4564	int32 aExp, bExp, zExp;
4565	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4566	int32 expDiff;
4567	float128 z;
4568
4569	aSig1 = extractFloat128Frac1( a );
4570	aSig0 = extractFloat128Frac0( a );
4571	aExp = extractFloat128Exp( a );
4572	bSig1 = extractFloat128Frac1( b );
4573	bSig0 = extractFloat128Frac0( b );
4574	bExp = extractFloat128Exp( b );
4575	expDiff = aExp - bExp;
4576	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4577	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4578	if ( 0 < expDiff ) goto aExpBigger;
4579	if ( expDiff < 0 ) goto bExpBigger;
4580	if ( aExp == 0x7FFF ) {
4581	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4582	return propagateFloat128NaN( a, b STATUS_VAR );
4583	}
4584	float_raise( float_flag_invalid STATUS_VAR);
4585	z.low = float128_default_nan_low;
4586	z.high = float128_default_nan_high;
4587	return z;
4588	}
4589	if ( aExp == 0 ) {
4590	aExp = 1;
4591	bExp = 1;
4592	}
4593	if ( bSig0 < aSig0 ) goto aBigger;
4594	if ( aSig0 < bSig0 ) goto bBigger;
4595	if ( bSig1 < aSig1 ) goto aBigger;
4596	if ( aSig1 < bSig1 ) goto bBigger;
4597	return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4598	bExpBigger:
4599	if ( bExp == 0x7FFF ) {
4600	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4601	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4602	}
4603	if ( aExp == 0 ) {
4604	++expDiff;
4605	}
4606	else {
4607	aSig0 \|= LIT64( 0x4000000000000000 );
4608	}
4609	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4610	bSig0 \|= LIT64( 0x4000000000000000 );
4611	bBigger:
4612	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4613	zExp = bExp;
4614	zSign ^= 1;
4615	goto normalizeRoundAndPack;
4616	aExpBigger:
4617	if ( aExp == 0x7FFF ) {
4618	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4619	return a;
4620	}
4621	if ( bExp == 0 ) {
4622	--expDiff;
4623	}
4624	else {
4625	bSig0 \|= LIT64( 0x4000000000000000 );
4626	}
4627	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4628	aSig0 \|= LIT64( 0x4000000000000000 );
4629	aBigger:
4630	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4631	zExp = aExp;
4632	normalizeRoundAndPack:
4633	--zExp;
4634	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4635
4636	}
4637
4638	/*----------------------------------------------------------------------------
4639	\| Returns the result of adding the quadruple-precision floating-point values
4640	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4641	\| for Binary Floating-Point Arithmetic.
4642	----------------------------------------------------------------------------/
4643
4644	float128 float128_add( float128 a, float128 b STATUS_PARAM )
4645	{
4646	flag aSign, bSign;
4647
4648	aSign = extractFloat128Sign( a );
4649	bSign = extractFloat128Sign( b );
4650	if ( aSign == bSign ) {
4651	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4652	}
4653	else {
4654	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4655	}
4656
4657	}
4658
4659	/*----------------------------------------------------------------------------
4660	\| Returns the result of subtracting the quadruple-precision floating-point
4661	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4662	\| Standard for Binary Floating-Point Arithmetic.
4663	----------------------------------------------------------------------------/
4664
4665	float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4666	{
4667	flag aSign, bSign;
4668
4669	aSign = extractFloat128Sign( a );
4670	bSign = extractFloat128Sign( b );
4671	if ( aSign == bSign ) {
4672	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4673	}
4674	else {
4675	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4676	}
4677
4678	}
4679
4680	/*----------------------------------------------------------------------------
4681	\| Returns the result of multiplying the quadruple-precision floating-point
4682	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4683	\| Standard for Binary Floating-Point Arithmetic.
4684	----------------------------------------------------------------------------/
4685
4686	float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4687	{
4688	flag aSign, bSign, zSign;
4689	int32 aExp, bExp, zExp;
4690	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4691	float128 z;
4692
4693	aSig1 = extractFloat128Frac1( a );
4694	aSig0 = extractFloat128Frac0( a );
4695	aExp = extractFloat128Exp( a );
4696	aSign = extractFloat128Sign( a );
4697	bSig1 = extractFloat128Frac1( b );
4698	bSig0 = extractFloat128Frac0( b );
4699	bExp = extractFloat128Exp( b );
4700	bSign = extractFloat128Sign( b );
4701	zSign = aSign ^ bSign;
4702	if ( aExp == 0x7FFF ) {
4703	if ( ( aSig0 \| aSig1 )
4704	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4705	return propagateFloat128NaN( a, b STATUS_VAR );
4706	}
4707	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid;
4708	return packFloat128( zSign, 0x7FFF, 0, 0 );
4709	}
4710	if ( bExp == 0x7FFF ) {
4711	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4712	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4713	invalid:
4714	float_raise( float_flag_invalid STATUS_VAR);
4715	z.low = float128_default_nan_low;
4716	z.high = float128_default_nan_high;
4717	return z;
4718	}
4719	return packFloat128( zSign, 0x7FFF, 0, 0 );
4720	}
4721	if ( aExp == 0 ) {
4722	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4723	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4724	}
4725	if ( bExp == 0 ) {
4726	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4727	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4728	}
4729	zExp = aExp + bExp - 0x4000;
4730	aSig0 \|= LIT64( 0x0001000000000000 );
4731	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4732	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4733	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4734	zSig2 \|= ( zSig3 != 0 );
4735	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4736	shift128ExtraRightJamming(
4737	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4738	++zExp;
4739	}
4740	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4741
4742	}
4743
4744	/*----------------------------------------------------------------------------
4745	\| Returns the result of dividing the quadruple-precision floating-point value
4746	\| `a' by the corresponding value `b'. The operation is performed according to
4747	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4748	----------------------------------------------------------------------------/
4749
4750	float128 float128_div( float128 a, float128 b STATUS_PARAM )
4751	{
4752	flag aSign, bSign, zSign;
4753	int32 aExp, bExp, zExp;
4754	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4756	float128 z;
4757
4758	aSig1 = extractFloat128Frac1( a );
4759	aSig0 = extractFloat128Frac0( a );
4760	aExp = extractFloat128Exp( a );
4761	aSign = extractFloat128Sign( a );
4762	bSig1 = extractFloat128Frac1( b );
4763	bSig0 = extractFloat128Frac0( b );
4764	bExp = extractFloat128Exp( b );
4765	bSign = extractFloat128Sign( b );
4766	zSign = aSign ^ bSign;
4767	if ( aExp == 0x7FFF ) {
4768	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4769	if ( bExp == 0x7FFF ) {
4770	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4771	goto invalid;
4772	}
4773	return packFloat128( zSign, 0x7FFF, 0, 0 );
4774	}
4775	if ( bExp == 0x7FFF ) {
4776	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777	return packFloat128( zSign, 0, 0, 0 );
4778	}
4779	if ( bExp == 0 ) {
4780	if ( ( bSig0 \| bSig1 ) == 0 ) {
4781	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4782	invalid:
4783	float_raise( float_flag_invalid STATUS_VAR);
4784	z.low = float128_default_nan_low;
4785	z.high = float128_default_nan_high;
4786	return z;
4787	}
4788	float_raise( float_flag_divbyzero STATUS_VAR);
4789	return packFloat128( zSign, 0x7FFF, 0, 0 );
4790	}
4791	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4792	}
4793	if ( aExp == 0 ) {
4794	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4795	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4796	}
4797	zExp = aExp - bExp + 0x3FFD;
4798	shortShift128Left(
4799	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4800	shortShift128Left(
4801	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4802	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4803	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4804	++zExp;
4805	}
4806	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4807	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4808	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4809	while ( (sbits64) rem0 < 0 ) {
4810	--zSig0;
4811	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4812	}
4813	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4814	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4815	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4816	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4817	while ( (sbits64) rem1 < 0 ) {
4818	--zSig1;
4819	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4820	}
4821	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4822	}
4823	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4824	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4825
4826	}
4827
4828	/*----------------------------------------------------------------------------
4829	\| Returns the remainder of the quadruple-precision floating-point value `a'
4830	\| with respect to the corresponding value `b'. The operation is performed
4831	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4832	----------------------------------------------------------------------------/
4833
4834	float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4835	{
4836	flag aSign, bSign, zSign;
4837	int32 aExp, bExp, expDiff;
4838	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4839	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4840	sbits64 sigMean0;
4841	float128 z;
4842
4843	aSig1 = extractFloat128Frac1( a );
4844	aSig0 = extractFloat128Frac0( a );
4845	aExp = extractFloat128Exp( a );
4846	aSign = extractFloat128Sign( a );
4847	bSig1 = extractFloat128Frac1( b );
4848	bSig0 = extractFloat128Frac0( b );
4849	bExp = extractFloat128Exp( b );
4850	bSign = extractFloat128Sign( b );
4851	if ( aExp == 0x7FFF ) {
4852	if ( ( aSig0 \| aSig1 )
4853	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4854	return propagateFloat128NaN( a, b STATUS_VAR );
4855	}
4856	goto invalid;
4857	}
4858	if ( bExp == 0x7FFF ) {
4859	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4860	return a;
4861	}
4862	if ( bExp == 0 ) {
4863	if ( ( bSig0 \| bSig1 ) == 0 ) {
4864	invalid:
4865	float_raise( float_flag_invalid STATUS_VAR);
4866	z.low = float128_default_nan_low;
4867	z.high = float128_default_nan_high;
4868	return z;
4869	}
4870	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4871	}
4872	if ( aExp == 0 ) {
4873	if ( ( aSig0 \| aSig1 ) == 0 ) return a;
4874	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4875	}
4876	expDiff = aExp - bExp;
4877	if ( expDiff < -1 ) return a;
4878	shortShift128Left(
4879	aSig0 \| LIT64( 0x0001000000000000 ),
4880	aSig1,
4881	15 - ( expDiff < 0 ),
4882	&aSig0,
4883	&aSig1
4884	);
4885	shortShift128Left(
4886	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4887	q = le128( bSig0, bSig1, aSig0, aSig1 );
4888	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4889	expDiff -= 64;
4890	while ( 0 < expDiff ) {
4891	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4892	q = ( 4 < q ) ? q - 4 : 0;
4893	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4894	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4895	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4896	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4897	expDiff -= 61;
4898	}
4899	if ( -64 < expDiff ) {
4900	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4901	q = ( 4 < q ) ? q - 4 : 0;
4902	q >>= - expDiff;
4903	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904	expDiff += 52;
4905	if ( expDiff < 0 ) {
4906	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4907	}
4908	else {
4909	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4910	}
4911	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4912	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4913	}
4914	else {
4915	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4916	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4917	}
4918	do {
4919	alternateASig0 = aSig0;
4920	alternateASig1 = aSig1;
4921	++q;
4922	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4923	} while ( 0 <= (sbits64) aSig0 );
4924	add128(
4925	aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4926	if ( ( sigMean0 < 0 )
4927	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4928	aSig0 = alternateASig0;
4929	aSig1 = alternateASig1;
4930	}
4931	zSign = ( (sbits64) aSig0 < 0 );
4932	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4933	return
4934	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4935
4936	}
4937
4938	/*----------------------------------------------------------------------------
4939	\| Returns the square root of the quadruple-precision floating-point value `a'.
4940	\| The operation is performed according to the IEC/IEEE Standard for Binary
4941	\| Floating-Point Arithmetic.
4942	----------------------------------------------------------------------------/
4943
4944	float128 float128_sqrt( float128 a STATUS_PARAM )
4945	{
4946	flag aSign;
4947	int32 aExp, zExp;
4948	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4949	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4950	float128 z;
4951
4952	aSig1 = extractFloat128Frac1( a );
4953	aSig0 = extractFloat128Frac0( a );
4954	aExp = extractFloat128Exp( a );
4955	aSign = extractFloat128Sign( a );
4956	if ( aExp == 0x7FFF ) {
4957	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4958	if ( ! aSign ) return a;
4959	goto invalid;
4960	}
4961	if ( aSign ) {
4962	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a;
4963	invalid:
4964	float_raise( float_flag_invalid STATUS_VAR);
4965	z.low = float128_default_nan_low;
4966	z.high = float128_default_nan_high;
4967	return z;
4968	}
4969	if ( aExp == 0 ) {
4970	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4971	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4972	}
4973	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4974	aSig0 \|= LIT64( 0x0001000000000000 );
4975	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4976	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4977	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4978	doubleZSig0 = zSig0<<1;
4979	mul64To128( zSig0, zSig0, &term0, &term1 );
4980	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4981	while ( (sbits64) rem0 < 0 ) {
4982	--zSig0;
4983	doubleZSig0 -= 2;
4984	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
4985	}
4986	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4987	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4988	if ( zSig1 == 0 ) zSig1 = 1;
4989	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4990	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4991	mul64To128( zSig1, zSig1, &term2, &term3 );
4992	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4993	while ( (sbits64) rem1 < 0 ) {
4994	--zSig1;
4995	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4996	term3 \|= 1;
4997	term2 \|= doubleZSig0;
4998	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4999	}
5000	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
5001	}
5002	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5003	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5004
5005	}
5006
5007	/*----------------------------------------------------------------------------
5008	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5009	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5010	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011	----------------------------------------------------------------------------/
5012
5013	int float128_eq( float128 a, float128 b STATUS_PARAM )
5014	{
5015
5016	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5017	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5018	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5019	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5020	) {
5021	if ( float128_is_signaling_nan( a )
5022	\|\| float128_is_signaling_nan( b ) ) {
5023	float_raise( float_flag_invalid STATUS_VAR);
5024	}
5025	return 0;
5026	}
5027	return
5028	( a.low == b.low )
5029	&& ( ( a.high == b.high )
5030	\|\| ( ( a.low == 0 )
5031	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5032	);
5033
5034	}
5035
5036	/*----------------------------------------------------------------------------
5037	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5038	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
5039	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5040	\| Arithmetic.
5041	----------------------------------------------------------------------------/
5042
5043	int float128_le( float128 a, float128 b STATUS_PARAM )
5044	{
5045	flag aSign, bSign;
5046
5047	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5048	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5049	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5050	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5051	) {
5052	float_raise( float_flag_invalid STATUS_VAR);
5053	return 0;
5054	}
5055	aSign = extractFloat128Sign( a );
5056	bSign = extractFloat128Sign( b );
5057	if ( aSign != bSign ) {
5058	return
5059	aSign
5060	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5061	== 0 );
5062	}
5063	return
5064	aSign ? le128( b.high, b.low, a.high, a.low )
5065	: le128( a.high, a.low, b.high, b.low );
5066
5067	}
5068
5069	/*----------------------------------------------------------------------------
5070	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5071	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5072	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073	----------------------------------------------------------------------------/
5074
5075	int float128_lt( float128 a, float128 b STATUS_PARAM )
5076	{
5077	flag aSign, bSign;
5078
5079	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5080	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5081	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5082	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5083	) {
5084	float_raise( float_flag_invalid STATUS_VAR);
5085	return 0;
5086	}
5087	aSign = extractFloat128Sign( a );
5088	bSign = extractFloat128Sign( b );
5089	if ( aSign != bSign ) {
5090	return
5091	aSign
5092	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5093	!= 0 );
5094	}
5095	return
5096	aSign ? lt128( b.high, b.low, a.high, a.low )
5097	: lt128( a.high, a.low, b.high, b.low );
5098
5099	}
5100
5101	/*----------------------------------------------------------------------------
5102	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5103	\| the corresponding value `b', and 0 otherwise. The invalid exception is
5104	\| raised if either operand is a NaN. Otherwise, the comparison is performed
5105	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106	----------------------------------------------------------------------------/
5107
5108	int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5109	{
5110
5111	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5112	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5113	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5114	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5115	) {
5116	float_raise( float_flag_invalid STATUS_VAR);
5117	return 0;
5118	}
5119	return
5120	( a.low == b.low )
5121	&& ( ( a.high == b.high )
5122	\|\| ( ( a.low == 0 )
5123	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5124	);
5125
5126	}
5127
5128	/*----------------------------------------------------------------------------
5129	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5130	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5131	\| cause an exception. Otherwise, the comparison is performed according to the
5132	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5133	----------------------------------------------------------------------------/
5134
5135	int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5136	{
5137	flag aSign, bSign;
5138
5139	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5140	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5141	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5142	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5143	) {
5144	if ( float128_is_signaling_nan( a )
5145	\|\| float128_is_signaling_nan( b ) ) {
5146	float_raise( float_flag_invalid STATUS_VAR);
5147	}
5148	return 0;
5149	}
5150	aSign = extractFloat128Sign( a );
5151	bSign = extractFloat128Sign( b );
5152	if ( aSign != bSign ) {
5153	return
5154	aSign
5155	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5156	== 0 );
5157	}
5158	return
5159	aSign ? le128( b.high, b.low, a.high, a.low )
5160	: le128( a.high, a.low, b.high, b.low );
5161
5162	}
5163
5164	/*----------------------------------------------------------------------------
5165	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5166	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5167	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5168	\| Standard for Binary Floating-Point Arithmetic.
5169	----------------------------------------------------------------------------/
5170
5171	int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5172	{
5173	flag aSign, bSign;
5174
5175	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5176	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5177	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5178	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5179	) {
5180	if ( float128_is_signaling_nan( a )
5181	\|\| float128_is_signaling_nan( b ) ) {
5182	float_raise( float_flag_invalid STATUS_VAR);
5183	}
5184	return 0;
5185	}
5186	aSign = extractFloat128Sign( a );
5187	bSign = extractFloat128Sign( b );
5188	if ( aSign != bSign ) {
5189	return
5190	aSign
5191	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5192	!= 0 );
5193	}
5194	return
5195	aSign ? lt128( b.high, b.low, a.high, a.low )
5196	: lt128( a.high, a.low, b.high, b.low );
5197
5198	}
5199
5200	#endif
5201
5202	/* misc functions */
5203	float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5204	{
5205	return int64_to_float32(a STATUS_VAR);
5206	}
5207
5208	float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5209	{
5210	return int64_to_float64(a STATUS_VAR);
5211	}
5212
5213	unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5214	{
5215	int64_t v;
5216	unsigned int res;
5217
5218	v = float32_to_int64(a STATUS_VAR);
5219	if (v < 0) {
5220	res = 0;
5221	float_raise( float_flag_invalid STATUS_VAR);
5222	} else if (v > 0xffffffff) {
5223	res = 0xffffffff;
5224	float_raise( float_flag_invalid STATUS_VAR);
5225	} else {
5226	res = v;
5227	}
5228	return res;
5229	}
5230
5231	unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5232	{
5233	int64_t v;
5234	unsigned int res;
5235
5236	v = float32_to_int64_round_to_zero(a STATUS_VAR);
5237	if (v < 0) {
5238	res = 0;
5239	float_raise( float_flag_invalid STATUS_VAR);
5240	} else if (v > 0xffffffff) {
5241	res = 0xffffffff;
5242	float_raise( float_flag_invalid STATUS_VAR);
5243	} else {
5244	res = v;
5245	}
5246	return res;
5247	}
5248
5249	unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5250	{
5251	int64_t v;
5252	unsigned int res;
5253
5254	v = float64_to_int64(a STATUS_VAR);
5255	if (v < 0) {
5256	res = 0;
5257	float_raise( float_flag_invalid STATUS_VAR);
5258	} else if (v > 0xffffffff) {
5259	res = 0xffffffff;
5260	float_raise( float_flag_invalid STATUS_VAR);
5261	} else {
5262	res = v;
5263	}
5264	return res;
5265	}
5266
5267	unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5268	{
5269	int64_t v;
5270	unsigned int res;
5271
5272	v = float64_to_int64_round_to_zero(a STATUS_VAR);
5273	if (v < 0) {
5274	res = 0;
5275	float_raise( float_flag_invalid STATUS_VAR);
5276	} else if (v > 0xffffffff) {
5277	res = 0xffffffff;
5278	float_raise( float_flag_invalid STATUS_VAR);
5279	} else {
5280	res = v;
5281	}
5282	return res;
5283	}
5284
5285	#define COMPARE(s, nan_exp) \
5286	INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5287	int is_quiet STATUS_PARAM ) \
5288	{ \
5289	flag aSign, bSign; \
5290	\
5291	if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5292	extractFloat ## s ## Frac( a ) ) \|\| \
5293	( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5294	extractFloat ## s ## Frac( b ) )) { \
5295	if (!is_quiet \|\| \
5296	float ## s ## _is_signaling_nan( a ) \|\| \
5297	float ## s ## _is_signaling_nan( b ) ) { \
5298	float_raise( float_flag_invalid STATUS_VAR); \
5299	} \
5300	return float_relation_unordered; \
5301	} \
5302	aSign = extractFloat ## s ## Sign( a ); \
5303	bSign = extractFloat ## s ## Sign( b ); \
5304	if ( aSign != bSign ) { \
5305	if ( (bits ## s) ( ( a \| b )<<1 ) == 0 ) { \
5306	/* zero case */ \
5307	return float_relation_equal; \
5308	} else { \
5309	return 1 - (2 * aSign); \
5310	} \
5311	} else { \
5312	if (a == b) { \
5313	return float_relation_equal; \
5314	} else { \
5315	return 1 - 2 * (aSign ^ ( a < b )); \
5316	} \
5317	} \
5318	} \
5319	\
5320	int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5321	{ \
5322	return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5323	} \
5324	\
5325	int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5326	{ \
5327	return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5328	}
5329
5330	COMPARE(32, 0xff)
5331	COMPARE(64, 0x7ff)

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/recompiler/fpu/softfloat.c@ 13728

以其他格式下載: