softfloat.c@ 22173

最後變更在這個檔案從22173是 21292,由 vboxsync 提交於 15 年前
svn:eol-style native
屬性 svn:eol-style 設為 `native`
檔案大小: 198.1 KB

行
1
2	/*============================================================================
3
4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5	Package, Release 2b.
6
7	Written by John R. Hauser. This work was made possible in part by the
8	International Computer Science Institute, located at Suite 600, 1947 Center
9	Street, Berkeley, California 94704. Funding was partially provided by the
10	National Science Foundation under grant MIP-9311980. The original version
11	of this code was written as part of a project to build a fixed-point vector
12	processor in collaboration with the University of California at Berkeley,
13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15	arithmetic/SoftFloat.html'.
16
17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26	Derivative works are acceptable, even for commercial purposes, so long as
27	(1) the source code for the derivative work includes prominent notice that
28	the work is derivative, and (2) the source code includes prominent notice with
29	these four paragraphs for those parts of this code that are retained.
30
31	=============================================================================*/
32
33	/* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34	be flushed to zero. */
35	#include "softfloat.h"
36
37	/*----------------------------------------------------------------------------
38	\| Primitive arithmetic functions, including multi-word arithmetic, and
39	\| division and square root approximations. (Can be specialized to target if
40	\| desired.)
41	----------------------------------------------------------------------------/
42	#include "softfloat-macros.h"
43
44	/*----------------------------------------------------------------------------
45	\| Functions and definitions to determine: (1) whether tininess for underflow
46	\| is detected before or after rounding by default, (2) what (if anything)
47	\| happens when exceptions are raised, (3) how signaling NaNs are distinguished
48	\| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49	\| are propagated from function inputs to output. These details are target-
50	\| specific.
51	----------------------------------------------------------------------------/
52	#include "softfloat-specialize.h"
53
54	void set_float_rounding_mode(int val STATUS_PARAM)
55	{
56	STATUS(float_rounding_mode) = val;
57	}
58
59	void set_float_exception_flags(int val STATUS_PARAM)
60	{
61	STATUS(float_exception_flags) = val;
62	}
63
64	#ifdef FLOATX80
65	void set_floatx80_rounding_precision(int val STATUS_PARAM)
66	{
67	STATUS(floatx80_rounding_precision) = val;
68	}
69	#endif
70
71	/*----------------------------------------------------------------------------
72	\| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73	\| and 7, and returns the properly rounded 32-bit integer corresponding to the
74	\| input. If `zSign' is 1, the input is negated before being converted to an
75	\| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76	\| is simply rounded to an integer, with the inexact exception raised if the
77	\| input cannot be represented exactly as an integer. However, if the fixed-
78	\| point input is too large, the invalid exception is raised and the largest
79	\| positive or negative integer is returned.
80	----------------------------------------------------------------------------/
81
82	static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83	{
84	int8 roundingMode;
85	flag roundNearestEven;
86	int8 roundIncrement, roundBits;
87	int32 z;
88
89	roundingMode = STATUS(float_rounding_mode);
90	roundNearestEven = ( roundingMode == float_round_nearest_even );
91	roundIncrement = 0x40;
92	if ( ! roundNearestEven ) {
93	if ( roundingMode == float_round_to_zero ) {
94	roundIncrement = 0;
95	}
96	else {
97	roundIncrement = 0x7F;
98	if ( zSign ) {
99	if ( roundingMode == float_round_up ) roundIncrement = 0;
100	}
101	else {
102	if ( roundingMode == float_round_down ) roundIncrement = 0;
103	}
104	}
105	}
106	roundBits = absZ & 0x7F;
107	absZ = ( absZ + roundIncrement )>>7;
108	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109	z = absZ;
110	if ( zSign ) z = - z;
111	if ( ( absZ>>32 ) \|\| ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112	float_raise( float_flag_invalid STATUS_VAR);
113	return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114	}
115	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
116	return z;
117
118	}
119
120	/*----------------------------------------------------------------------------
121	\| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122	\| `absZ1', with binary point between bits 63 and 64 (between the input words),
123	\| and returns the properly rounded 64-bit integer corresponding to the input.
124	\| If `zSign' is 1, the input is negated before being converted to an integer.
125	\| Ordinarily, the fixed-point input is simply rounded to an integer, with
126	\| the inexact exception raised if the input cannot be represented exactly as
127	\| an integer. However, if the fixed-point input is too large, the invalid
128	\| exception is raised and the largest positive or negative integer is
129	\| returned.
130	----------------------------------------------------------------------------/
131
132	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133	{
134	int8 roundingMode;
135	flag roundNearestEven, increment;
136	int64 z;
137
138	roundingMode = STATUS(float_rounding_mode);
139	roundNearestEven = ( roundingMode == float_round_nearest_even );
140	increment = ( (sbits64) absZ1 < 0 );
141	if ( ! roundNearestEven ) {
142	if ( roundingMode == float_round_to_zero ) {
143	increment = 0;
144	}
145	else {
146	if ( zSign ) {
147	increment = ( roundingMode == float_round_down ) && absZ1;
148	}
149	else {
150	increment = ( roundingMode == float_round_up ) && absZ1;
151	}
152	}
153	}
154	if ( increment ) {
155	++absZ0;
156	if ( absZ0 == 0 ) goto overflow;
157	absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158	}
159	z = absZ0;
160	if ( zSign ) z = - z;
161	if ( z && ( ( z < 0 ) ^ zSign ) ) {
162	overflow:
163	float_raise( float_flag_invalid STATUS_VAR);
164	return
165	zSign ? (sbits64) LIT64( 0x8000000000000000 )
166	: LIT64( 0x7FFFFFFFFFFFFFFF );
167	}
168	if ( absZ1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
169	return z;
170
171	}
172
173	/*----------------------------------------------------------------------------
174	\| Returns the fraction bits of the single-precision floating-point value `a'.
175	----------------------------------------------------------------------------/
176
177	INLINE bits32 extractFloat32Frac( float32 a )
178	{
179
180	return float32_val(a) & 0x007FFFFF;
181
182	}
183
184	/*----------------------------------------------------------------------------
185	\| Returns the exponent bits of the single-precision floating-point value `a'.
186	----------------------------------------------------------------------------/
187
188	INLINE int16 extractFloat32Exp( float32 a )
189	{
190
191	return ( float32_val(a)>>23 ) & 0xFF;
192
193	}
194
195	/*----------------------------------------------------------------------------
196	\| Returns the sign bit of the single-precision floating-point value `a'.
197	----------------------------------------------------------------------------/
198
199	INLINE flag extractFloat32Sign( float32 a )
200	{
201
202	return float32_val(a)>>31;
203
204	}
205
206	/*----------------------------------------------------------------------------
207	\| Normalizes the subnormal single-precision floating-point value represented
208	\| by the denormalized significand `aSig'. The normalized exponent and
209	\| significand are stored at the locations pointed to by `zExpPtr' and
210	\| `zSigPtr', respectively.
211	----------------------------------------------------------------------------/
212
213	static void
214	normalizeFloat32Subnormal( bits32 aSig, int16 zExpPtr, bits32 zSigPtr )
215	{
216	int8 shiftCount;
217
218	shiftCount = countLeadingZeros32( aSig ) - 8;
219	*zSigPtr = aSig<<shiftCount;
220	*zExpPtr = 1 - shiftCount;
221
222	}
223
224	/*----------------------------------------------------------------------------
225	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226	\| single-precision floating-point value, returning the result. After being
227	\| shifted into the proper positions, the three fields are simply added
228	\| together to form the result. This means that any integer portion of `zSig'
229	\| will be added into the exponent. Since a properly normalized significand
230	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
231	\| than the desired result exponent whenever `zSig' is a complete, normalized
232	\| significand.
233	----------------------------------------------------------------------------/
234
235	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236	{
237
238	return make_float32(
239	( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240
241	}
242
243	/*----------------------------------------------------------------------------
244	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245	\| and significand `zSig', and returns the proper single-precision floating-
246	\| point value corresponding to the abstract input. Ordinarily, the abstract
247	\| value is simply rounded and packed into the single-precision format, with
248	\| the inexact exception raised if the abstract input cannot be represented
249	\| exactly. However, if the abstract value is too large, the overflow and
250	\| inexact exceptions are raised and an infinity or maximal finite value is
251	\| returned. If the abstract value is too small, the input value is rounded to
252	\| a subnormal number, and the underflow and inexact exceptions are raised if
253	\| the abstract input cannot be represented exactly as a subnormal single-
254	\| precision floating-point number.
255	\| The input significand `zSig' has its binary point between bits 30
256	\| and 29, which is 7 bits to the left of the usual location. This shifted
257	\| significand must be normalized or smaller. If `zSig' is not normalized,
258	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
259	\| and it must not require rounding. In the usual case that `zSig' is
260	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
262	\| Binary Floating-Point Arithmetic.
263	----------------------------------------------------------------------------/
264
265	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266	{
267	int8 roundingMode;
268	flag roundNearestEven;
269	int8 roundIncrement, roundBits;
270	flag isTiny;
271
272	roundingMode = STATUS(float_rounding_mode);
273	roundNearestEven = ( roundingMode == float_round_nearest_even );
274	roundIncrement = 0x40;
275	if ( ! roundNearestEven ) {
276	if ( roundingMode == float_round_to_zero ) {
277	roundIncrement = 0;
278	}
279	else {
280	roundIncrement = 0x7F;
281	if ( zSign ) {
282	if ( roundingMode == float_round_up ) roundIncrement = 0;
283	}
284	else {
285	if ( roundingMode == float_round_down ) roundIncrement = 0;
286	}
287	}
288	}
289	roundBits = zSig & 0x7F;
290	if ( 0xFD <= (bits16) zExp ) {
291	if ( ( 0xFD < zExp )
292	\|\| ( ( zExp == 0xFD )
293	&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294	) {
295	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
296	return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297	}
298	if ( zExp < 0 ) {
299	if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300	isTiny =
301	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302	\|\| ( zExp < -1 )
303	\|\| ( zSig + roundIncrement < 0x80000000 );
304	shift32RightJamming( zSig, - zExp, &zSig );
305	zExp = 0;
306	roundBits = zSig & 0x7F;
307	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308	}
309	}
310	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
311	zSig = ( zSig + roundIncrement )>>7;
312	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313	if ( zSig == 0 ) zExp = 0;
314	return packFloat32( zSign, zExp, zSig );
315
316	}
317
318	/*----------------------------------------------------------------------------
319	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320	\| and significand `zSig', and returns the proper single-precision floating-
321	\| point value corresponding to the abstract input. This routine is just like
322	\| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323	\| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324	\| floating-point exponent.
325	----------------------------------------------------------------------------/
326
327	static float32
328	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329	{
330	int8 shiftCount;
331
332	shiftCount = countLeadingZeros32( zSig ) - 1;
333	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334
335	}
336
337	/*----------------------------------------------------------------------------
338	\| Returns the fraction bits of the double-precision floating-point value `a'.
339	----------------------------------------------------------------------------/
340
341	INLINE bits64 extractFloat64Frac( float64 a )
342	{
343
344	return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345
346	}
347
348	/*----------------------------------------------------------------------------
349	\| Returns the exponent bits of the double-precision floating-point value `a'.
350	----------------------------------------------------------------------------/
351
352	INLINE int16 extractFloat64Exp( float64 a )
353	{
354
355	return ( float64_val(a)>>52 ) & 0x7FF;
356
357	}
358
359	/*----------------------------------------------------------------------------
360	\| Returns the sign bit of the double-precision floating-point value `a'.
361	----------------------------------------------------------------------------/
362
363	INLINE flag extractFloat64Sign( float64 a )
364	{
365
366	return float64_val(a)>>63;
367
368	}
369
370	/*----------------------------------------------------------------------------
371	\| Normalizes the subnormal double-precision floating-point value represented
372	\| by the denormalized significand `aSig'. The normalized exponent and
373	\| significand are stored at the locations pointed to by `zExpPtr' and
374	\| `zSigPtr', respectively.
375	----------------------------------------------------------------------------/
376
377	static void
378	normalizeFloat64Subnormal( bits64 aSig, int16 zExpPtr, bits64 zSigPtr )
379	{
380	int8 shiftCount;
381
382	shiftCount = countLeadingZeros64( aSig ) - 11;
383	*zSigPtr = aSig<<shiftCount;
384	*zExpPtr = 1 - shiftCount;
385
386	}
387
388	/*----------------------------------------------------------------------------
389	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390	\| double-precision floating-point value, returning the result. After being
391	\| shifted into the proper positions, the three fields are simply added
392	\| together to form the result. This means that any integer portion of `zSig'
393	\| will be added into the exponent. Since a properly normalized significand
394	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
395	\| than the desired result exponent whenever `zSig' is a complete, normalized
396	\| significand.
397	----------------------------------------------------------------------------/
398
399	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400	{
401
402	return make_float64(
403	( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404
405	}
406
407	/*----------------------------------------------------------------------------
408	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409	\| and significand `zSig', and returns the proper double-precision floating-
410	\| point value corresponding to the abstract input. Ordinarily, the abstract
411	\| value is simply rounded and packed into the double-precision format, with
412	\| the inexact exception raised if the abstract input cannot be represented
413	\| exactly. However, if the abstract value is too large, the overflow and
414	\| inexact exceptions are raised and an infinity or maximal finite value is
415	\| returned. If the abstract value is too small, the input value is rounded
416	\| to a subnormal number, and the underflow and inexact exceptions are raised
417	\| if the abstract input cannot be represented exactly as a subnormal double-
418	\| precision floating-point number.
419	\| The input significand `zSig' has its binary point between bits 62
420	\| and 61, which is 10 bits to the left of the usual location. This shifted
421	\| significand must be normalized or smaller. If `zSig' is not normalized,
422	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
423	\| and it must not require rounding. In the usual case that `zSig' is
424	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
426	\| Binary Floating-Point Arithmetic.
427	----------------------------------------------------------------------------/
428
429	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430	{
431	int8 roundingMode;
432	flag roundNearestEven;
433	int16 roundIncrement, roundBits;
434	flag isTiny;
435
436	roundingMode = STATUS(float_rounding_mode);
437	roundNearestEven = ( roundingMode == float_round_nearest_even );
438	roundIncrement = 0x200;
439	if ( ! roundNearestEven ) {
440	if ( roundingMode == float_round_to_zero ) {
441	roundIncrement = 0;
442	}
443	else {
444	roundIncrement = 0x3FF;
445	if ( zSign ) {
446	if ( roundingMode == float_round_up ) roundIncrement = 0;
447	}
448	else {
449	if ( roundingMode == float_round_down ) roundIncrement = 0;
450	}
451	}
452	}
453	roundBits = zSig & 0x3FF;
454	if ( 0x7FD <= (bits16) zExp ) {
455	if ( ( 0x7FD < zExp )
456	\|\| ( ( zExp == 0x7FD )
457	&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458	) {
459	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
460	return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461	}
462	if ( zExp < 0 ) {
463	if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464	isTiny =
465	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466	\|\| ( zExp < -1 )
467	\|\| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468	shift64RightJamming( zSig, - zExp, &zSig );
469	zExp = 0;
470	roundBits = zSig & 0x3FF;
471	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472	}
473	}
474	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
475	zSig = ( zSig + roundIncrement )>>10;
476	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477	if ( zSig == 0 ) zExp = 0;
478	return packFloat64( zSign, zExp, zSig );
479
480	}
481
482	/*----------------------------------------------------------------------------
483	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484	\| and significand `zSig', and returns the proper double-precision floating-
485	\| point value corresponding to the abstract input. This routine is just like
486	\| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487	\| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488	\| floating-point exponent.
489	----------------------------------------------------------------------------/
490
491	static float64
492	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493	{
494	int8 shiftCount;
495
496	shiftCount = countLeadingZeros64( zSig ) - 1;
497	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498
499	}
500
501	#ifdef FLOATX80
502
503	/*----------------------------------------------------------------------------
504	\| Returns the fraction bits of the extended double-precision floating-point
505	\| value `a'.
506	----------------------------------------------------------------------------/
507
508	INLINE bits64 extractFloatx80Frac( floatx80 a )
509	{
510
511	return a.low;
512
513	}
514
515	/*----------------------------------------------------------------------------
516	\| Returns the exponent bits of the extended double-precision floating-point
517	\| value `a'.
518	----------------------------------------------------------------------------/
519
520	INLINE int32 extractFloatx80Exp( floatx80 a )
521	{
522
523	return a.high & 0x7FFF;
524
525	}
526
527	/*----------------------------------------------------------------------------
528	\| Returns the sign bit of the extended double-precision floating-point value
529	\| `a'.
530	----------------------------------------------------------------------------/
531
532	INLINE flag extractFloatx80Sign( floatx80 a )
533	{
534
535	return a.high>>15;
536
537	}
538
539	/*----------------------------------------------------------------------------
540	\| Normalizes the subnormal extended double-precision floating-point value
541	\| represented by the denormalized significand `aSig'. The normalized exponent
542	\| and significand are stored at the locations pointed to by `zExpPtr' and
543	\| `zSigPtr', respectively.
544	----------------------------------------------------------------------------/
545
546	static void
547	normalizeFloatx80Subnormal( bits64 aSig, int32 zExpPtr, bits64 zSigPtr )
548	{
549	int8 shiftCount;
550
551	shiftCount = countLeadingZeros64( aSig );
552	*zSigPtr = aSig<<shiftCount;
553	*zExpPtr = 1 - shiftCount;
554
555	}
556
557	/*----------------------------------------------------------------------------
558	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559	\| extended double-precision floating-point value, returning the result.
560	----------------------------------------------------------------------------/
561
562	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563	{
564	floatx80 z;
565
566	z.low = zSig;
567	z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568	return z;
569
570	}
571
572	/*----------------------------------------------------------------------------
573	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574	\| and extended significand formed by the concatenation of `zSig0' and `zSig1',
575	\| and returns the proper extended double-precision floating-point value
576	\| corresponding to the abstract input. Ordinarily, the abstract value is
577	\| rounded and packed into the extended double-precision format, with the
578	\| inexact exception raised if the abstract input cannot be represented
579	\| exactly. However, if the abstract value is too large, the overflow and
580	\| inexact exceptions are raised and an infinity or maximal finite value is
581	\| returned. If the abstract value is too small, the input value is rounded to
582	\| a subnormal number, and the underflow and inexact exceptions are raised if
583	\| the abstract input cannot be represented exactly as a subnormal extended
584	\| double-precision floating-point number.
585	\| If `roundingPrecision' is 32 or 64, the result is rounded to the same
586	\| number of bits as single or double precision, respectively. Otherwise, the
587	\| result is rounded to the full precision of the extended double-precision
588	\| format.
589	\| The input significand must be normalized or smaller. If the input
590	\| significand is not normalized, `zExp' must be 0; in that case, the result
591	\| returned is a subnormal number, and it must not require rounding. The
592	\| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593	\| Floating-Point Arithmetic.
594	----------------------------------------------------------------------------/
595
596	static floatx80
597	roundAndPackFloatx80(
598	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599	STATUS_PARAM)
600	{
601	int8 roundingMode;
602	flag roundNearestEven, increment, isTiny;
603	int64 roundIncrement, roundMask, roundBits;
604
605	roundingMode = STATUS(float_rounding_mode);
606	roundNearestEven = ( roundingMode == float_round_nearest_even );
607	if ( roundingPrecision == 80 ) goto precision80;
608	if ( roundingPrecision == 64 ) {
609	roundIncrement = LIT64( 0x0000000000000400 );
610	roundMask = LIT64( 0x00000000000007FF );
611	}
612	else if ( roundingPrecision == 32 ) {
613	roundIncrement = LIT64( 0x0000008000000000 );
614	roundMask = LIT64( 0x000000FFFFFFFFFF );
615	}
616	else {
617	goto precision80;
618	}
619	zSig0 \|= ( zSig1 != 0 );
620	if ( ! roundNearestEven ) {
621	if ( roundingMode == float_round_to_zero ) {
622	roundIncrement = 0;
623	}
624	else {
625	roundIncrement = roundMask;
626	if ( zSign ) {
627	if ( roundingMode == float_round_up ) roundIncrement = 0;
628	}
629	else {
630	if ( roundingMode == float_round_down ) roundIncrement = 0;
631	}
632	}
633	}
634	roundBits = zSig0 & roundMask;
635	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636	if ( ( 0x7FFE < zExp )
637	\|\| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638	) {
639	goto overflow;
640	}
641	if ( zExp <= 0 ) {
642	if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643	isTiny =
644	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645	\|\| ( zExp < 0 )
646	\|\| ( zSig0 <= zSig0 + roundIncrement );
647	shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648	zExp = 0;
649	roundBits = zSig0 & roundMask;
650	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
652	zSig0 += roundIncrement;
653	if ( (sbits64) zSig0 < 0 ) zExp = 1;
654	roundIncrement = roundMask + 1;
655	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656	roundMask \|= roundIncrement;
657	}
658	zSig0 &= ~ roundMask;
659	return packFloatx80( zSign, zExp, zSig0 );
660	}
661	}
662	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
663	zSig0 += roundIncrement;
664	if ( zSig0 < roundIncrement ) {
665	++zExp;
666	zSig0 = LIT64( 0x8000000000000000 );
667	}
668	roundIncrement = roundMask + 1;
669	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670	roundMask \|= roundIncrement;
671	}
672	zSig0 &= ~ roundMask;
673	if ( zSig0 == 0 ) zExp = 0;
674	return packFloatx80( zSign, zExp, zSig0 );
675	precision80:
676	increment = ( (sbits64) zSig1 < 0 );
677	if ( ! roundNearestEven ) {
678	if ( roundingMode == float_round_to_zero ) {
679	increment = 0;
680	}
681	else {
682	if ( zSign ) {
683	increment = ( roundingMode == float_round_down ) && zSig1;
684	}
685	else {
686	increment = ( roundingMode == float_round_up ) && zSig1;
687	}
688	}
689	}
690	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691	if ( ( 0x7FFE < zExp )
692	\|\| ( ( zExp == 0x7FFE )
693	&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694	&& increment
695	)
696	) {
697	roundMask = 0;
698	overflow:
699	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
700	if ( ( roundingMode == float_round_to_zero )
701	\|\| ( zSign && ( roundingMode == float_round_up ) )
702	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
703	) {
704	return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705	}
706	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707	}
708	if ( zExp <= 0 ) {
709	isTiny =
710	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711	\|\| ( zExp < 0 )
712	\|\| ! increment
713	\|\| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714	shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715	zExp = 0;
716	if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
718	if ( roundNearestEven ) {
719	increment = ( (sbits64) zSig1 < 0 );
720	}
721	else {
722	if ( zSign ) {
723	increment = ( roundingMode == float_round_down ) && zSig1;
724	}
725	else {
726	increment = ( roundingMode == float_round_up ) && zSig1;
727	}
728	}
729	if ( increment ) {
730	++zSig0;
731	zSig0 &=
732	~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733	if ( (sbits64) zSig0 < 0 ) zExp = 1;
734	}
735	return packFloatx80( zSign, zExp, zSig0 );
736	}
737	}
738	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
739	if ( increment ) {
740	++zSig0;
741	if ( zSig0 == 0 ) {
742	++zExp;
743	zSig0 = LIT64( 0x8000000000000000 );
744	}
745	else {
746	zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747	}
748	}
749	else {
750	if ( zSig0 == 0 ) zExp = 0;
751	}
752	return packFloatx80( zSign, zExp, zSig0 );
753
754	}
755
756	/*----------------------------------------------------------------------------
757	\| Takes an abstract floating-point value having sign `zSign', exponent
758	\| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759	\| and returns the proper extended double-precision floating-point value
760	\| corresponding to the abstract input. This routine is just like
761	\| `roundAndPackFloatx80' except that the input significand does not have to be
762	\| normalized.
763	----------------------------------------------------------------------------/
764
765	static floatx80
766	normalizeRoundAndPackFloatx80(
767	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768	STATUS_PARAM)
769	{
770	int8 shiftCount;
771
772	if ( zSig0 == 0 ) {
773	zSig0 = zSig1;
774	zSig1 = 0;
775	zExp -= 64;
776	}
777	shiftCount = countLeadingZeros64( zSig0 );
778	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779	zExp -= shiftCount;
780	return
781	roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782
783	}
784
785	#endif
786
787	#ifdef FLOAT128
788
789	/*----------------------------------------------------------------------------
790	\| Returns the least-significant 64 fraction bits of the quadruple-precision
791	\| floating-point value `a'.
792	----------------------------------------------------------------------------/
793
794	INLINE bits64 extractFloat128Frac1( float128 a )
795	{
796
797	return a.low;
798
799	}
800
801	/*----------------------------------------------------------------------------
802	\| Returns the most-significant 48 fraction bits of the quadruple-precision
803	\| floating-point value `a'.
804	----------------------------------------------------------------------------/
805
806	INLINE bits64 extractFloat128Frac0( float128 a )
807	{
808
809	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810
811	}
812
813	/*----------------------------------------------------------------------------
814	\| Returns the exponent bits of the quadruple-precision floating-point value
815	\| `a'.
816	----------------------------------------------------------------------------/
817
818	INLINE int32 extractFloat128Exp( float128 a )
819	{
820
821	return ( a.high>>48 ) & 0x7FFF;
822
823	}
824
825	/*----------------------------------------------------------------------------
826	\| Returns the sign bit of the quadruple-precision floating-point value `a'.
827	----------------------------------------------------------------------------/
828
829	INLINE flag extractFloat128Sign( float128 a )
830	{
831
832	return a.high>>63;
833
834	}
835
836	/*----------------------------------------------------------------------------
837	\| Normalizes the subnormal quadruple-precision floating-point value
838	\| represented by the denormalized significand formed by the concatenation of
839	\| `aSig0' and `aSig1'. The normalized exponent is stored at the location
840	\| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841	\| significand are stored at the location pointed to by `zSig0Ptr', and the
842	\| least significant 64 bits of the normalized significand are stored at the
843	\| location pointed to by `zSig1Ptr'.
844	----------------------------------------------------------------------------/
845
846	static void
847	normalizeFloat128Subnormal(
848	bits64 aSig0,
849	bits64 aSig1,
850	int32 *zExpPtr,
851	bits64 *zSig0Ptr,
852	bits64 *zSig1Ptr
853	)
854	{
855	int8 shiftCount;
856
857	if ( aSig0 == 0 ) {
858	shiftCount = countLeadingZeros64( aSig1 ) - 15;
859	if ( shiftCount < 0 ) {
860	*zSig0Ptr = aSig1>>( - shiftCount );
861	*zSig1Ptr = aSig1<<( shiftCount & 63 );
862	}
863	else {
864	*zSig0Ptr = aSig1<<shiftCount;
865	*zSig1Ptr = 0;
866	}
867	*zExpPtr = - shiftCount - 63;
868	}
869	else {
870	shiftCount = countLeadingZeros64( aSig0 ) - 15;
871	shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872	*zExpPtr = 1 - shiftCount;
873	}
874
875	}
876
877	/*----------------------------------------------------------------------------
878	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
879	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880	\| floating-point value, returning the result. After being shifted into the
881	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882	\| added together to form the most significant 32 bits of the result. This
883	\| means that any integer portion of `zSig0' will be added into the exponent.
884	\| Since a properly normalized significand will have an integer portion equal
885	\| to 1, the `zExp' input should be 1 less than the desired result exponent
886	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887	\| significand.
888	----------------------------------------------------------------------------/
889
890	INLINE float128
891	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892	{
893	float128 z;
894
895	z.low = zSig1;
896	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897	return z;
898
899	}
900
901	/*----------------------------------------------------------------------------
902	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
904	\| and `zSig2', and returns the proper quadruple-precision floating-point value
905	\| corresponding to the abstract input. Ordinarily, the abstract value is
906	\| simply rounded and packed into the quadruple-precision format, with the
907	\| inexact exception raised if the abstract input cannot be represented
908	\| exactly. However, if the abstract value is too large, the overflow and
909	\| inexact exceptions are raised and an infinity or maximal finite value is
910	\| returned. If the abstract value is too small, the input value is rounded to
911	\| a subnormal number, and the underflow and inexact exceptions are raised if
912	\| the abstract input cannot be represented exactly as a subnormal quadruple-
913	\| precision floating-point number.
914	\| The input significand must be normalized or smaller. If the input
915	\| significand is not normalized, `zExp' must be 0; in that case, the result
916	\| returned is a subnormal number, and it must not require rounding. In the
917	\| usual case that the input significand is normalized, `zExp' must be 1 less
918	\| than the ``true'' floating-point exponent. The handling of underflow and
919	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920	----------------------------------------------------------------------------/
921
922	static float128
923	roundAndPackFloat128(
924	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925	{
926	int8 roundingMode;
927	flag roundNearestEven, increment, isTiny;
928
929	roundingMode = STATUS(float_rounding_mode);
930	roundNearestEven = ( roundingMode == float_round_nearest_even );
931	increment = ( (sbits64) zSig2 < 0 );
932	if ( ! roundNearestEven ) {
933	if ( roundingMode == float_round_to_zero ) {
934	increment = 0;
935	}
936	else {
937	if ( zSign ) {
938	increment = ( roundingMode == float_round_down ) && zSig2;
939	}
940	else {
941	increment = ( roundingMode == float_round_up ) && zSig2;
942	}
943	}
944	}
945	if ( 0x7FFD <= (bits32) zExp ) {
946	if ( ( 0x7FFD < zExp )
947	\|\| ( ( zExp == 0x7FFD )
948	&& eq128(
949	LIT64( 0x0001FFFFFFFFFFFF ),
950	LIT64( 0xFFFFFFFFFFFFFFFF ),
951	zSig0,
952	zSig1
953	)
954	&& increment
955	)
956	) {
957	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
958	if ( ( roundingMode == float_round_to_zero )
959	\|\| ( zSign && ( roundingMode == float_round_up ) )
960	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
961	) {
962	return
963	packFloat128(
964	zSign,
965	0x7FFE,
966	LIT64( 0x0000FFFFFFFFFFFF ),
967	LIT64( 0xFFFFFFFFFFFFFFFF )
968	);
969	}
970	return packFloat128( zSign, 0x7FFF, 0, 0 );
971	}
972	if ( zExp < 0 ) {
973	if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974	isTiny =
975	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976	\|\| ( zExp < -1 )
977	\|\| ! increment
978	\|\| lt128(
979	zSig0,
980	zSig1,
981	LIT64( 0x0001FFFFFFFFFFFF ),
982	LIT64( 0xFFFFFFFFFFFFFFFF )
983	);
984	shift128ExtraRightJamming(
985	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986	zExp = 0;
987	if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988	if ( roundNearestEven ) {
989	increment = ( (sbits64) zSig2 < 0 );
990	}
991	else {
992	if ( zSign ) {
993	increment = ( roundingMode == float_round_down ) && zSig2;
994	}
995	else {
996	increment = ( roundingMode == float_round_up ) && zSig2;
997	}
998	}
999	}
1000	}
1001	if ( zSig2 ) STATUS(float_exception_flags) \|= float_flag_inexact;
1002	if ( increment ) {
1003	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005	}
1006	else {
1007	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
1008	}
1009	return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011	}
1012
1013	/*----------------------------------------------------------------------------
1014	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016	\| returns the proper quadruple-precision floating-point value corresponding
1017	\| to the abstract input. This routine is just like `roundAndPackFloat128'
1018	\| except that the input significand has fewer bits and does not have to be
1019	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020	\| point exponent.
1021	----------------------------------------------------------------------------/
1022
1023	static float128
1024	normalizeRoundAndPackFloat128(
1025	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026	{
1027	int8 shiftCount;
1028	bits64 zSig2;
1029
1030	if ( zSig0 == 0 ) {
1031	zSig0 = zSig1;
1032	zSig1 = 0;
1033	zExp -= 64;
1034	}
1035	shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036	if ( 0 <= shiftCount ) {
1037	zSig2 = 0;
1038	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039	}
1040	else {
1041	shift128ExtraRightJamming(
1042	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043	}
1044	zExp -= shiftCount;
1045	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047	}
1048
1049	#endif
1050
1051	/*----------------------------------------------------------------------------
1052	\| Returns the result of converting the 32-bit two's complement integer `a'
1053	\| to the single-precision floating-point format. The conversion is performed
1054	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055	----------------------------------------------------------------------------/
1056
1057	float32 int32_to_float32( int32 a STATUS_PARAM )
1058	{
1059	flag zSign;
1060
1061	if ( a == 0 ) return float32_zero;
1062	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063	zSign = ( a < 0 );
1064	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066	}
1067
1068	/*----------------------------------------------------------------------------
1069	\| Returns the result of converting the 32-bit two's complement integer `a'
1070	\| to the double-precision floating-point format. The conversion is performed
1071	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072	----------------------------------------------------------------------------/
1073
1074	float64 int32_to_float64( int32 a STATUS_PARAM )
1075	{
1076	flag zSign;
1077	uint32 absA;
1078	int8 shiftCount;
1079	bits64 zSig;
1080
1081	if ( a == 0 ) return float64_zero;
1082	zSign = ( a < 0 );
1083	absA = zSign ? - a : a;
1084	shiftCount = countLeadingZeros32( absA ) + 21;
1085	zSig = absA;
1086	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088	}
1089
1090	#ifdef FLOATX80
1091
1092	/*----------------------------------------------------------------------------
1093	\| Returns the result of converting the 32-bit two's complement integer `a'
1094	\| to the extended double-precision floating-point format. The conversion
1095	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096	\| Arithmetic.
1097	----------------------------------------------------------------------------/
1098
1099	floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100	{
1101	flag zSign;
1102	uint32 absA;
1103	int8 shiftCount;
1104	bits64 zSig;
1105
1106	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107	zSign = ( a < 0 );
1108	absA = zSign ? - a : a;
1109	shiftCount = countLeadingZeros32( absA ) + 32;
1110	zSig = absA;
1111	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113	}
1114
1115	#endif
1116
1117	#ifdef FLOAT128
1118
1119	/*----------------------------------------------------------------------------
1120	\| Returns the result of converting the 32-bit two's complement integer `a' to
1121	\| the quadruple-precision floating-point format. The conversion is performed
1122	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123	----------------------------------------------------------------------------/
1124
1125	float128 int32_to_float128( int32 a STATUS_PARAM )
1126	{
1127	flag zSign;
1128	uint32 absA;
1129	int8 shiftCount;
1130	bits64 zSig0;
1131
1132	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133	zSign = ( a < 0 );
1134	absA = zSign ? - a : a;
1135	shiftCount = countLeadingZeros32( absA ) + 17;
1136	zSig0 = absA;
1137	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139	}
1140
1141	#endif
1142
1143	/*----------------------------------------------------------------------------
1144	\| Returns the result of converting the 64-bit two's complement integer `a'
1145	\| to the single-precision floating-point format. The conversion is performed
1146	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147	----------------------------------------------------------------------------/
1148
1149	float32 int64_to_float32( int64 a STATUS_PARAM )
1150	{
1151	flag zSign;
1152	uint64 absA;
1153	int8 shiftCount;
1154
1155	if ( a == 0 ) return float32_zero;
1156	zSign = ( a < 0 );
1157	absA = zSign ? - a : a;
1158	shiftCount = countLeadingZeros64( absA ) - 40;
1159	if ( 0 <= shiftCount ) {
1160	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161	}
1162	else {
1163	shiftCount += 7;
1164	if ( shiftCount < 0 ) {
1165	shift64RightJamming( absA, - shiftCount, &absA );
1166	}
1167	else {
1168	absA <<= shiftCount;
1169	}
1170	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171	}
1172
1173	}
1174
1175	float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176	{
1177	int8 shiftCount;
1178
1179	if ( a == 0 ) return float32_zero;
1180	shiftCount = countLeadingZeros64( a ) - 40;
1181	if ( 0 <= shiftCount ) {
1182	return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183	}
1184	else {
1185	shiftCount += 7;
1186	if ( shiftCount < 0 ) {
1187	shift64RightJamming( a, - shiftCount, &a );
1188	}
1189	else {
1190	a <<= shiftCount;
1191	}
1192	return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193	}
1194	}
1195
1196	/*----------------------------------------------------------------------------
1197	\| Returns the result of converting the 64-bit two's complement integer `a'
1198	\| to the double-precision floating-point format. The conversion is performed
1199	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200	----------------------------------------------------------------------------/
1201
1202	float64 int64_to_float64( int64 a STATUS_PARAM )
1203	{
1204	flag zSign;
1205
1206	if ( a == 0 ) return float64_zero;
1207	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208	return packFloat64( 1, 0x43E, 0 );
1209	}
1210	zSign = ( a < 0 );
1211	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213	}
1214
1215	float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216	{
1217	if ( a == 0 ) return float64_zero;
1218	return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220	}
1221
1222	#ifdef FLOATX80
1223
1224	/*----------------------------------------------------------------------------
1225	\| Returns the result of converting the 64-bit two's complement integer `a'
1226	\| to the extended double-precision floating-point format. The conversion
1227	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228	\| Arithmetic.
1229	----------------------------------------------------------------------------/
1230
1231	floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232	{
1233	flag zSign;
1234	uint64 absA;
1235	int8 shiftCount;
1236
1237	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238	zSign = ( a < 0 );
1239	absA = zSign ? - a : a;
1240	shiftCount = countLeadingZeros64( absA );
1241	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243	}
1244
1245	#endif
1246
1247	#ifdef FLOAT128
1248
1249	/*----------------------------------------------------------------------------
1250	\| Returns the result of converting the 64-bit two's complement integer `a' to
1251	\| the quadruple-precision floating-point format. The conversion is performed
1252	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253	----------------------------------------------------------------------------/
1254
1255	float128 int64_to_float128( int64 a STATUS_PARAM )
1256	{
1257	flag zSign;
1258	uint64 absA;
1259	int8 shiftCount;
1260	int32 zExp;
1261	bits64 zSig0, zSig1;
1262
1263	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264	zSign = ( a < 0 );
1265	absA = zSign ? - a : a;
1266	shiftCount = countLeadingZeros64( absA ) + 49;
1267	zExp = 0x406E - shiftCount;
1268	if ( 64 <= shiftCount ) {
1269	zSig1 = 0;
1270	zSig0 = absA;
1271	shiftCount -= 64;
1272	}
1273	else {
1274	zSig1 = absA;
1275	zSig0 = 0;
1276	}
1277	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278	return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280	}
1281
1282	#endif
1283
1284	/*----------------------------------------------------------------------------
1285	\| Returns the result of converting the single-precision floating-point value
1286	\| `a' to the 32-bit two's complement integer format. The conversion is
1287	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288	\| Arithmetic---which means in particular that the conversion is rounded
1289	\| according to the current rounding mode. If `a' is a NaN, the largest
1290	\| positive integer is returned. Otherwise, if the conversion overflows, the
1291	\| largest integer with the same sign as `a' is returned.
1292	----------------------------------------------------------------------------/
1293
1294	int32 float32_to_int32( float32 a STATUS_PARAM )
1295	{
1296	flag aSign;
1297	int16 aExp, shiftCount;
1298	bits32 aSig;
1299	bits64 aSig64;
1300
1301	aSig = extractFloat32Frac( a );
1302	aExp = extractFloat32Exp( a );
1303	aSign = extractFloat32Sign( a );
1304	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305	if ( aExp ) aSig \|= 0x00800000;
1306	shiftCount = 0xAF - aExp;
1307	aSig64 = aSig;
1308	aSig64 <<= 32;
1309	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310	return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312	}
1313
1314	/*----------------------------------------------------------------------------
1315	\| Returns the result of converting the single-precision floating-point value
1316	\| `a' to the 32-bit two's complement integer format. The conversion is
1317	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318	\| Arithmetic, except that the conversion is always rounded toward zero.
1319	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320	\| the conversion overflows, the largest integer with the same sign as `a' is
1321	\| returned.
1322	----------------------------------------------------------------------------/
1323
1324	int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325	{
1326	flag aSign;
1327	int16 aExp, shiftCount;
1328	bits32 aSig;
1329	int32 z;
1330
1331	aSig = extractFloat32Frac( a );
1332	aExp = extractFloat32Exp( a );
1333	aSign = extractFloat32Sign( a );
1334	shiftCount = aExp - 0x9E;
1335	if ( 0 <= shiftCount ) {
1336	if ( float32_val(a) != 0xCF000000 ) {
1337	float_raise( float_flag_invalid STATUS_VAR);
1338	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339	}
1340	return (sbits32) 0x80000000;
1341	}
1342	else if ( aExp <= 0x7E ) {
1343	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1344	return 0;
1345	}
1346	aSig = ( aSig \| 0x00800000 )<<8;
1347	z = aSig>>( - shiftCount );
1348	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349	STATUS(float_exception_flags) \|= float_flag_inexact;
1350	}
1351	if ( aSign ) z = - z;
1352	return z;
1353
1354	}
1355
1356	/*----------------------------------------------------------------------------
1357	\| Returns the result of converting the single-precision floating-point value
1358	\| `a' to the 64-bit two's complement integer format. The conversion is
1359	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360	\| Arithmetic---which means in particular that the conversion is rounded
1361	\| according to the current rounding mode. If `a' is a NaN, the largest
1362	\| positive integer is returned. Otherwise, if the conversion overflows, the
1363	\| largest integer with the same sign as `a' is returned.
1364	----------------------------------------------------------------------------/
1365
1366	int64 float32_to_int64( float32 a STATUS_PARAM )
1367	{
1368	flag aSign;
1369	int16 aExp, shiftCount;
1370	bits32 aSig;
1371	bits64 aSig64, aSigExtra;
1372
1373	aSig = extractFloat32Frac( a );
1374	aExp = extractFloat32Exp( a );
1375	aSign = extractFloat32Sign( a );
1376	shiftCount = 0xBE - aExp;
1377	if ( shiftCount < 0 ) {
1378	float_raise( float_flag_invalid STATUS_VAR);
1379	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1380	return LIT64( 0x7FFFFFFFFFFFFFFF );
1381	}
1382	return (sbits64) LIT64( 0x8000000000000000 );
1383	}
1384	if ( aExp ) aSig \|= 0x00800000;
1385	aSig64 = aSig;
1386	aSig64 <<= 40;
1387	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388	return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390	}
1391
1392	/*----------------------------------------------------------------------------
1393	\| Returns the result of converting the single-precision floating-point value
1394	\| `a' to the 64-bit two's complement integer format. The conversion is
1395	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396	\| Arithmetic, except that the conversion is always rounded toward zero. If
1397	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398	\| conversion overflows, the largest integer with the same sign as `a' is
1399	\| returned.
1400	----------------------------------------------------------------------------/
1401
1402	int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403	{
1404	flag aSign;
1405	int16 aExp, shiftCount;
1406	bits32 aSig;
1407	bits64 aSig64;
1408	int64 z;
1409
1410	aSig = extractFloat32Frac( a );
1411	aExp = extractFloat32Exp( a );
1412	aSign = extractFloat32Sign( a );
1413	shiftCount = aExp - 0xBE;
1414	if ( 0 <= shiftCount ) {
1415	if ( float32_val(a) != 0xDF000000 ) {
1416	float_raise( float_flag_invalid STATUS_VAR);
1417	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1418	return LIT64( 0x7FFFFFFFFFFFFFFF );
1419	}
1420	}
1421	return (sbits64) LIT64( 0x8000000000000000 );
1422	}
1423	else if ( aExp <= 0x7E ) {
1424	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1425	return 0;
1426	}
1427	aSig64 = aSig \| 0x00800000;
1428	aSig64 <<= 40;
1429	z = aSig64>>( - shiftCount );
1430	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431	STATUS(float_exception_flags) \|= float_flag_inexact;
1432	}
1433	if ( aSign ) z = - z;
1434	return z;
1435
1436	}
1437
1438	/*----------------------------------------------------------------------------
1439	\| Returns the result of converting the single-precision floating-point value
1440	\| `a' to the double-precision floating-point format. The conversion is
1441	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442	\| Arithmetic.
1443	----------------------------------------------------------------------------/
1444
1445	float64 float32_to_float64( float32 a STATUS_PARAM )
1446	{
1447	flag aSign;
1448	int16 aExp;
1449	bits32 aSig;
1450
1451	aSig = extractFloat32Frac( a );
1452	aExp = extractFloat32Exp( a );
1453	aSign = extractFloat32Sign( a );
1454	if ( aExp == 0xFF ) {
1455	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456	return packFloat64( aSign, 0x7FF, 0 );
1457	}
1458	if ( aExp == 0 ) {
1459	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461	--aExp;
1462	}
1463	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465	}
1466
1467	#ifdef FLOATX80
1468
1469	/*----------------------------------------------------------------------------
1470	\| Returns the result of converting the single-precision floating-point value
1471	\| `a' to the extended double-precision floating-point format. The conversion
1472	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473	\| Arithmetic.
1474	----------------------------------------------------------------------------/
1475
1476	floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477	{
1478	flag aSign;
1479	int16 aExp;
1480	bits32 aSig;
1481
1482	aSig = extractFloat32Frac( a );
1483	aExp = extractFloat32Exp( a );
1484	aSign = extractFloat32Sign( a );
1485	if ( aExp == 0xFF ) {
1486	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488	}
1489	if ( aExp == 0 ) {
1490	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492	}
1493	aSig \|= 0x00800000;
1494	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496	}
1497
1498	#endif
1499
1500	#ifdef FLOAT128
1501
1502	/*----------------------------------------------------------------------------
1503	\| Returns the result of converting the single-precision floating-point value
1504	\| `a' to the double-precision floating-point format. The conversion is
1505	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506	\| Arithmetic.
1507	----------------------------------------------------------------------------/
1508
1509	float128 float32_to_float128( float32 a STATUS_PARAM )
1510	{
1511	flag aSign;
1512	int16 aExp;
1513	bits32 aSig;
1514
1515	aSig = extractFloat32Frac( a );
1516	aExp = extractFloat32Exp( a );
1517	aSign = extractFloat32Sign( a );
1518	if ( aExp == 0xFF ) {
1519	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520	return packFloat128( aSign, 0x7FFF, 0, 0 );
1521	}
1522	if ( aExp == 0 ) {
1523	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525	--aExp;
1526	}
1527	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529	}
1530
1531	#endif
1532
1533	/*----------------------------------------------------------------------------
1534	\| Rounds the single-precision floating-point value `a' to an integer, and
1535	\| returns the result as a single-precision floating-point value. The
1536	\| operation is performed according to the IEC/IEEE Standard for Binary
1537	\| Floating-Point Arithmetic.
1538	----------------------------------------------------------------------------/
1539
1540	float32 float32_round_to_int( float32 a STATUS_PARAM)
1541	{
1542	flag aSign;
1543	int16 aExp;
1544	bits32 lastBitMask, roundBitsMask;
1545	int8 roundingMode;
1546	bits32 z;
1547
1548	aExp = extractFloat32Exp( a );
1549	if ( 0x96 <= aExp ) {
1550	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551	return propagateFloat32NaN( a, a STATUS_VAR );
1552	}
1553	return a;
1554	}
1555	if ( aExp <= 0x7E ) {
1556	if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557	STATUS(float_exception_flags) \|= float_flag_inexact;
1558	aSign = extractFloat32Sign( a );
1559	switch ( STATUS(float_rounding_mode) ) {
1560	case float_round_nearest_even:
1561	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562	return packFloat32( aSign, 0x7F, 0 );
1563	}
1564	break;
1565	case float_round_down:
1566	return make_float32(aSign ? 0xBF800000 : 0);
1567	case float_round_up:
1568	return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569	}
1570	return packFloat32( aSign, 0, 0 );
1571	}
1572	lastBitMask = 1;
1573	lastBitMask <<= 0x96 - aExp;
1574	roundBitsMask = lastBitMask - 1;
1575	z = float32_val(a);
1576	roundingMode = STATUS(float_rounding_mode);
1577	if ( roundingMode == float_round_nearest_even ) {
1578	z += lastBitMask>>1;
1579	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580	}
1581	else if ( roundingMode != float_round_to_zero ) {
1582	if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583	z += roundBitsMask;
1584	}
1585	}
1586	z &= ~ roundBitsMask;
1587	if ( z != float32_val(a) ) STATUS(float_exception_flags) \|= float_flag_inexact;
1588	return make_float32(z);
1589
1590	}
1591
1592	/*----------------------------------------------------------------------------
1593	\| Returns the result of adding the absolute values of the single-precision
1594	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595	\| before being returned. `zSign' is ignored if the result is a NaN.
1596	\| The addition is performed according to the IEC/IEEE Standard for Binary
1597	\| Floating-Point Arithmetic.
1598	----------------------------------------------------------------------------/
1599
1600	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601	{
1602	int16 aExp, bExp, zExp;
1603	bits32 aSig, bSig, zSig;
1604	int16 expDiff;
1605
1606	aSig = extractFloat32Frac( a );
1607	aExp = extractFloat32Exp( a );
1608	bSig = extractFloat32Frac( b );
1609	bExp = extractFloat32Exp( b );
1610	expDiff = aExp - bExp;
1611	aSig <<= 6;
1612	bSig <<= 6;
1613	if ( 0 < expDiff ) {
1614	if ( aExp == 0xFF ) {
1615	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616	return a;
1617	}
1618	if ( bExp == 0 ) {
1619	--expDiff;
1620	}
1621	else {
1622	bSig \|= 0x20000000;
1623	}
1624	shift32RightJamming( bSig, expDiff, &bSig );
1625	zExp = aExp;
1626	}
1627	else if ( expDiff < 0 ) {
1628	if ( bExp == 0xFF ) {
1629	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630	return packFloat32( zSign, 0xFF, 0 );
1631	}
1632	if ( aExp == 0 ) {
1633	++expDiff;
1634	}
1635	else {
1636	aSig \|= 0x20000000;
1637	}
1638	shift32RightJamming( aSig, - expDiff, &aSig );
1639	zExp = bExp;
1640	}
1641	else {
1642	if ( aExp == 0xFF ) {
1643	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644	return a;
1645	}
1646	if ( aExp == 0 ) {
1647	if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648	return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649	}
1650	zSig = 0x40000000 + aSig + bSig;
1651	zExp = aExp;
1652	goto roundAndPack;
1653	}
1654	aSig \|= 0x20000000;
1655	zSig = ( aSig + bSig )<<1;
1656	--zExp;
1657	if ( (sbits32) zSig < 0 ) {
1658	zSig = aSig + bSig;
1659	++zExp;
1660	}
1661	roundAndPack:
1662	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664	}
1665
1666	/*----------------------------------------------------------------------------
1667	\| Returns the result of subtracting the absolute values of the single-
1668	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
1669	\| difference is negated before being returned. `zSign' is ignored if the
1670	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
1671	\| Standard for Binary Floating-Point Arithmetic.
1672	----------------------------------------------------------------------------/
1673
1674	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675	{
1676	int16 aExp, bExp, zExp;
1677	bits32 aSig, bSig, zSig;
1678	int16 expDiff;
1679
1680	aSig = extractFloat32Frac( a );
1681	aExp = extractFloat32Exp( a );
1682	bSig = extractFloat32Frac( b );
1683	bExp = extractFloat32Exp( b );
1684	expDiff = aExp - bExp;
1685	aSig <<= 7;
1686	bSig <<= 7;
1687	if ( 0 < expDiff ) goto aExpBigger;
1688	if ( expDiff < 0 ) goto bExpBigger;
1689	if ( aExp == 0xFF ) {
1690	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691	float_raise( float_flag_invalid STATUS_VAR);
1692	return float32_default_nan;
1693	}
1694	if ( aExp == 0 ) {
1695	aExp = 1;
1696	bExp = 1;
1697	}
1698	if ( bSig < aSig ) goto aBigger;
1699	if ( aSig < bSig ) goto bBigger;
1700	return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701	bExpBigger:
1702	if ( bExp == 0xFF ) {
1703	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704	return packFloat32( zSign ^ 1, 0xFF, 0 );
1705	}
1706	if ( aExp == 0 ) {
1707	++expDiff;
1708	}
1709	else {
1710	aSig \|= 0x40000000;
1711	}
1712	shift32RightJamming( aSig, - expDiff, &aSig );
1713	bSig \|= 0x40000000;
1714	bBigger:
1715	zSig = bSig - aSig;
1716	zExp = bExp;
1717	zSign ^= 1;
1718	goto normalizeRoundAndPack;
1719	aExpBigger:
1720	if ( aExp == 0xFF ) {
1721	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722	return a;
1723	}
1724	if ( bExp == 0 ) {
1725	--expDiff;
1726	}
1727	else {
1728	bSig \|= 0x40000000;
1729	}
1730	shift32RightJamming( bSig, expDiff, &bSig );
1731	aSig \|= 0x40000000;
1732	aBigger:
1733	zSig = aSig - bSig;
1734	zExp = aExp;
1735	normalizeRoundAndPack:
1736	--zExp;
1737	return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739	}
1740
1741	/*----------------------------------------------------------------------------
1742	\| Returns the result of adding the single-precision floating-point values `a'
1743	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
1744	\| Binary Floating-Point Arithmetic.
1745	----------------------------------------------------------------------------/
1746
1747	float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748	{
1749	flag aSign, bSign;
1750
1751	aSign = extractFloat32Sign( a );
1752	bSign = extractFloat32Sign( b );
1753	if ( aSign == bSign ) {
1754	return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755	}
1756	else {
1757	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758	}
1759
1760	}
1761
1762	/*----------------------------------------------------------------------------
1763	\| Returns the result of subtracting the single-precision floating-point values
1764	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765	\| for Binary Floating-Point Arithmetic.
1766	----------------------------------------------------------------------------/
1767
1768	float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769	{
1770	flag aSign, bSign;
1771
1772	aSign = extractFloat32Sign( a );
1773	bSign = extractFloat32Sign( b );
1774	if ( aSign == bSign ) {
1775	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776	}
1777	else {
1778	return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779	}
1780
1781	}
1782
1783	/*----------------------------------------------------------------------------
1784	\| Returns the result of multiplying the single-precision floating-point values
1785	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786	\| for Binary Floating-Point Arithmetic.
1787	----------------------------------------------------------------------------/
1788
1789	float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790	{
1791	flag aSign, bSign, zSign;
1792	int16 aExp, bExp, zExp;
1793	bits32 aSig, bSig;
1794	bits64 zSig64;
1795	bits32 zSig;
1796
1797	aSig = extractFloat32Frac( a );
1798	aExp = extractFloat32Exp( a );
1799	aSign = extractFloat32Sign( a );
1800	bSig = extractFloat32Frac( b );
1801	bExp = extractFloat32Exp( b );
1802	bSign = extractFloat32Sign( b );
1803	zSign = aSign ^ bSign;
1804	if ( aExp == 0xFF ) {
1805	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1806	return propagateFloat32NaN( a, b STATUS_VAR );
1807	}
1808	if ( ( bExp \| bSig ) == 0 ) {
1809	float_raise( float_flag_invalid STATUS_VAR);
1810	return float32_default_nan;
1811	}
1812	return packFloat32( zSign, 0xFF, 0 );
1813	}
1814	if ( bExp == 0xFF ) {
1815	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816	if ( ( aExp \| aSig ) == 0 ) {
1817	float_raise( float_flag_invalid STATUS_VAR);
1818	return float32_default_nan;
1819	}
1820	return packFloat32( zSign, 0xFF, 0 );
1821	}
1822	if ( aExp == 0 ) {
1823	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825	}
1826	if ( bExp == 0 ) {
1827	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829	}
1830	zExp = aExp + bExp - 0x7F;
1831	aSig = ( aSig \| 0x00800000 )<<7;
1832	bSig = ( bSig \| 0x00800000 )<<8;
1833	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834	zSig = zSig64;
1835	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836	zSig <<= 1;
1837	--zExp;
1838	}
1839	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841	}
1842
1843	/*----------------------------------------------------------------------------
1844	\| Returns the result of dividing the single-precision floating-point value `a'
1845	\| by the corresponding value `b'. The operation is performed according to the
1846	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847	----------------------------------------------------------------------------/
1848
1849	float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850	{
1851	flag aSign, bSign, zSign;
1852	int16 aExp, bExp, zExp;
1853	bits32 aSig, bSig, zSig;
1854
1855	aSig = extractFloat32Frac( a );
1856	aExp = extractFloat32Exp( a );
1857	aSign = extractFloat32Sign( a );
1858	bSig = extractFloat32Frac( b );
1859	bExp = extractFloat32Exp( b );
1860	bSign = extractFloat32Sign( b );
1861	zSign = aSign ^ bSign;
1862	if ( aExp == 0xFF ) {
1863	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864	if ( bExp == 0xFF ) {
1865	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866	float_raise( float_flag_invalid STATUS_VAR);
1867	return float32_default_nan;
1868	}
1869	return packFloat32( zSign, 0xFF, 0 );
1870	}
1871	if ( bExp == 0xFF ) {
1872	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873	return packFloat32( zSign, 0, 0 );
1874	}
1875	if ( bExp == 0 ) {
1876	if ( bSig == 0 ) {
1877	if ( ( aExp \| aSig ) == 0 ) {
1878	float_raise( float_flag_invalid STATUS_VAR);
1879	return float32_default_nan;
1880	}
1881	float_raise( float_flag_divbyzero STATUS_VAR);
1882	return packFloat32( zSign, 0xFF, 0 );
1883	}
1884	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885	}
1886	if ( aExp == 0 ) {
1887	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889	}
1890	zExp = aExp - bExp + 0x7D;
1891	aSig = ( aSig \| 0x00800000 )<<7;
1892	bSig = ( bSig \| 0x00800000 )<<8;
1893	if ( bSig <= ( aSig + aSig ) ) {
1894	aSig >>= 1;
1895	++zExp;
1896	}
1897	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898	if ( ( zSig & 0x3F ) == 0 ) {
1899	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900	}
1901	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903	}
1904
1905	/*----------------------------------------------------------------------------
1906	\| Returns the remainder of the single-precision floating-point value `a'
1907	\| with respect to the corresponding value `b'. The operation is performed
1908	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909	----------------------------------------------------------------------------/
1910
1911	float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912	{
1913	flag aSign, bSign, zSign;
1914	int16 aExp, bExp, expDiff;
1915	bits32 aSig, bSig;
1916	bits32 q;
1917	bits64 aSig64, bSig64, q64;
1918	bits32 alternateASig;
1919	sbits32 sigMean;
1920
1921	aSig = extractFloat32Frac( a );
1922	aExp = extractFloat32Exp( a );
1923	aSign = extractFloat32Sign( a );
1924	bSig = extractFloat32Frac( b );
1925	bExp = extractFloat32Exp( b );
1926	bSign = extractFloat32Sign( b );
1927	if ( aExp == 0xFF ) {
1928	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1929	return propagateFloat32NaN( a, b STATUS_VAR );
1930	}
1931	float_raise( float_flag_invalid STATUS_VAR);
1932	return float32_default_nan;
1933	}
1934	if ( bExp == 0xFF ) {
1935	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936	return a;
1937	}
1938	if ( bExp == 0 ) {
1939	if ( bSig == 0 ) {
1940	float_raise( float_flag_invalid STATUS_VAR);
1941	return float32_default_nan;
1942	}
1943	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944	}
1945	if ( aExp == 0 ) {
1946	if ( aSig == 0 ) return a;
1947	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948	}
1949	expDiff = aExp - bExp;
1950	aSig \|= 0x00800000;
1951	bSig \|= 0x00800000;
1952	if ( expDiff < 32 ) {
1953	aSig <<= 8;
1954	bSig <<= 8;
1955	if ( expDiff < 0 ) {
1956	if ( expDiff < -1 ) return a;
1957	aSig >>= 1;
1958	}
1959	q = ( bSig <= aSig );
1960	if ( q ) aSig -= bSig;
1961	if ( 0 < expDiff ) {
1962	q = ( ( (bits64) aSig )<<32 ) / bSig;
1963	q >>= 32 - expDiff;
1964	bSig >>= 2;
1965	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966	}
1967	else {
1968	aSig >>= 2;
1969	bSig >>= 2;
1970	}
1971	}
1972	else {
1973	if ( bSig <= aSig ) aSig -= bSig;
1974	aSig64 = ( (bits64) aSig )<<40;
1975	bSig64 = ( (bits64) bSig )<<40;
1976	expDiff -= 64;
1977	while ( 0 < expDiff ) {
1978	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980	aSig64 = - ( ( bSig * q64 )<<38 );
1981	expDiff -= 62;
1982	}
1983	expDiff += 64;
1984	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986	q = q64>>( 64 - expDiff );
1987	bSig <<= 6;
1988	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989	}
1990	do {
1991	alternateASig = aSig;
1992	++q;
1993	aSig -= bSig;
1994	} while ( 0 <= (sbits32) aSig );
1995	sigMean = aSig + alternateASig;
1996	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997	aSig = alternateASig;
1998	}
1999	zSign = ( (sbits32) aSig < 0 );
2000	if ( zSign ) aSig = - aSig;
2001	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002
2003	}
2004
2005	/*----------------------------------------------------------------------------
2006	\| Returns the square root of the single-precision floating-point value `a'.
2007	\| The operation is performed according to the IEC/IEEE Standard for Binary
2008	\| Floating-Point Arithmetic.
2009	----------------------------------------------------------------------------/
2010
2011	float32 float32_sqrt( float32 a STATUS_PARAM )
2012	{
2013	flag aSign;
2014	int16 aExp, zExp;
2015	bits32 aSig, zSig;
2016	bits64 rem, term;
2017
2018	aSig = extractFloat32Frac( a );
2019	aExp = extractFloat32Exp( a );
2020	aSign = extractFloat32Sign( a );
2021	if ( aExp == 0xFF ) {
2022	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023	if ( ! aSign ) return a;
2024	float_raise( float_flag_invalid STATUS_VAR);
2025	return float32_default_nan;
2026	}
2027	if ( aSign ) {
2028	if ( ( aExp \| aSig ) == 0 ) return a;
2029	float_raise( float_flag_invalid STATUS_VAR);
2030	return float32_default_nan;
2031	}
2032	if ( aExp == 0 ) {
2033	if ( aSig == 0 ) return float32_zero;
2034	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035	}
2036	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037	aSig = ( aSig \| 0x00800000 )<<8;
2038	zSig = estimateSqrt32( aExp, aSig ) + 2;
2039	if ( ( zSig & 0x7F ) <= 5 ) {
2040	if ( zSig < 2 ) {
2041	zSig = 0x7FFFFFFF;
2042	goto roundAndPack;
2043	}
2044	aSig >>= aExp & 1;
2045	term = ( (bits64) zSig ) * zSig;
2046	rem = ( ( (bits64) aSig )<<32 ) - term;
2047	while ( (sbits64) rem < 0 ) {
2048	--zSig;
2049	rem += ( ( (bits64) zSig )<<1 ) \| 1;
2050	}
2051	zSig \|= ( rem != 0 );
2052	}
2053	shift32RightJamming( zSig, 1, &zSig );
2054	roundAndPack:
2055	return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056
2057	}
2058
2059	/*----------------------------------------------------------------------------
2060	\| Returns the binary log of the single-precision floating-point value `a'.
2061	\| The operation is performed according to the IEC/IEEE Standard for Binary
2062	\| Floating-Point Arithmetic.
2063	----------------------------------------------------------------------------/
2064	float32 float32_log2( float32 a STATUS_PARAM )
2065	{
2066	flag aSign, zSign;
2067	int16 aExp;
2068	bits32 aSig, zSig, i;
2069
2070	aSig = extractFloat32Frac( a );
2071	aExp = extractFloat32Exp( a );
2072	aSign = extractFloat32Sign( a );
2073
2074	if ( aExp == 0 ) {
2075	if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2076	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077	}
2078	if ( aSign ) {
2079	float_raise( float_flag_invalid STATUS_VAR);
2080	return float32_default_nan;
2081	}
2082	if ( aExp == 0xFF ) {
2083	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2084	return a;
2085	}
2086
2087	aExp -= 0x7F;
2088	aSig \|= 0x00800000;
2089	zSign = aExp < 0;
2090	zSig = aExp << 23;
2091
2092	for (i = 1 << 22; i > 0; i >>= 1) {
2093	aSig = ( (bits64)aSig * aSig ) >> 23;
2094	if ( aSig & 0x01000000 ) {
2095	aSig >>= 1;
2096	zSig \|= i;
2097	}
2098	}
2099
2100	if ( zSign )
2101	zSig = -zSig;
2102
2103	return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2104	}
2105
2106	/*----------------------------------------------------------------------------
2107	\| Returns 1 if the single-precision floating-point value `a' is equal to
2108	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2109	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110	----------------------------------------------------------------------------/
2111
2112	int float32_eq( float32 a, float32 b STATUS_PARAM )
2113	{
2114
2115	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117	) {
2118	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2119	float_raise( float_flag_invalid STATUS_VAR);
2120	}
2121	return 0;
2122	}
2123	return ( float32_val(a) == float32_val(b) ) \|\|
2124	( (bits32) ( ( float32_val(a) \| float32_val(b) )<<1 ) == 0 );
2125
2126	}
2127
2128	/*----------------------------------------------------------------------------
2129	\| Returns 1 if the single-precision floating-point value `a' is less than
2130	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
2131	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132	\| Arithmetic.
2133	----------------------------------------------------------------------------/
2134
2135	int float32_le( float32 a, float32 b STATUS_PARAM )
2136	{
2137	flag aSign, bSign;
2138	bits32 av, bv;
2139
2140	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142	) {
2143	float_raise( float_flag_invalid STATUS_VAR);
2144	return 0;
2145	}
2146	aSign = extractFloat32Sign( a );
2147	bSign = extractFloat32Sign( b );
2148	av = float32_val(a);
2149	bv = float32_val(b);
2150	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2151	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2152
2153	}
2154
2155	/*----------------------------------------------------------------------------
2156	\| Returns 1 if the single-precision floating-point value `a' is less than
2157	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2158	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159	----------------------------------------------------------------------------/
2160
2161	int float32_lt( float32 a, float32 b STATUS_PARAM )
2162	{
2163	flag aSign, bSign;
2164	bits32 av, bv;
2165
2166	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2167	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168	) {
2169	float_raise( float_flag_invalid STATUS_VAR);
2170	return 0;
2171	}
2172	aSign = extractFloat32Sign( a );
2173	bSign = extractFloat32Sign( b );
2174	av = float32_val(a);
2175	bv = float32_val(b);
2176	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2177	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2178
2179	}
2180
2181	/*----------------------------------------------------------------------------
2182	\| Returns 1 if the single-precision floating-point value `a' is equal to
2183	\| the corresponding value `b', and 0 otherwise. The invalid exception is
2184	\| raised if either operand is a NaN. Otherwise, the comparison is performed
2185	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186	----------------------------------------------------------------------------/
2187
2188	int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189	{
2190	bits32 av, bv;
2191
2192	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194	) {
2195	float_raise( float_flag_invalid STATUS_VAR);
2196	return 0;
2197	}
2198	av = float32_val(a);
2199	bv = float32_val(b);
2200	return ( av == bv ) \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2201
2202	}
2203
2204	/*----------------------------------------------------------------------------
2205	\| Returns 1 if the single-precision floating-point value `a' is less than or
2206	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2207	\| cause an exception. Otherwise, the comparison is performed according to the
2208	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209	----------------------------------------------------------------------------/
2210
2211	int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212	{
2213	flag aSign, bSign;
2214	bits32 av, bv;
2215
2216	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218	) {
2219	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2220	float_raise( float_flag_invalid STATUS_VAR);
2221	}
2222	return 0;
2223	}
2224	aSign = extractFloat32Sign( a );
2225	bSign = extractFloat32Sign( b );
2226	av = float32_val(a);
2227	bv = float32_val(b);
2228	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2229	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2230
2231	}
2232
2233	/*----------------------------------------------------------------------------
2234	\| Returns 1 if the single-precision floating-point value `a' is less than
2235	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2236	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2237	\| Standard for Binary Floating-Point Arithmetic.
2238	----------------------------------------------------------------------------/
2239
2240	int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241	{
2242	flag aSign, bSign;
2243	bits32 av, bv;
2244
2245	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2246	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247	) {
2248	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2249	float_raise( float_flag_invalid STATUS_VAR);
2250	}
2251	return 0;
2252	}
2253	aSign = extractFloat32Sign( a );
2254	bSign = extractFloat32Sign( b );
2255	av = float32_val(a);
2256	bv = float32_val(b);
2257	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2258	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259
2260	}
2261
2262	/*----------------------------------------------------------------------------
2263	\| Returns the result of converting the double-precision floating-point value
2264	\| `a' to the 32-bit two's complement integer format. The conversion is
2265	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2266	\| Arithmetic---which means in particular that the conversion is rounded
2267	\| according to the current rounding mode. If `a' is a NaN, the largest
2268	\| positive integer is returned. Otherwise, if the conversion overflows, the
2269	\| largest integer with the same sign as `a' is returned.
2270	----------------------------------------------------------------------------/
2271
2272	int32 float64_to_int32( float64 a STATUS_PARAM )
2273	{
2274	flag aSign;
2275	int16 aExp, shiftCount;
2276	bits64 aSig;
2277
2278	aSig = extractFloat64Frac( a );
2279	aExp = extractFloat64Exp( a );
2280	aSign = extractFloat64Sign( a );
2281	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2282	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2283	shiftCount = 0x42C - aExp;
2284	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2285	return roundAndPackInt32( aSign, aSig STATUS_VAR );
2286
2287	}
2288
2289	/*----------------------------------------------------------------------------
2290	\| Returns the result of converting the double-precision floating-point value
2291	\| `a' to the 32-bit two's complement integer format. The conversion is
2292	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2293	\| Arithmetic, except that the conversion is always rounded toward zero.
2294	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2295	\| the conversion overflows, the largest integer with the same sign as `a' is
2296	\| returned.
2297	----------------------------------------------------------------------------/
2298
2299	int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300	{
2301	flag aSign;
2302	int16 aExp, shiftCount;
2303	bits64 aSig, savedASig;
2304	int32 z;
2305
2306	aSig = extractFloat64Frac( a );
2307	aExp = extractFloat64Exp( a );
2308	aSign = extractFloat64Sign( a );
2309	if ( 0x41E < aExp ) {
2310	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2311	goto invalid;
2312	}
2313	else if ( aExp < 0x3FF ) {
2314	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2315	return 0;
2316	}
2317	aSig \|= LIT64( 0x0010000000000000 );
2318	shiftCount = 0x433 - aExp;
2319	savedASig = aSig;
2320	aSig >>= shiftCount;
2321	z = aSig;
2322	if ( aSign ) z = - z;
2323	if ( ( z < 0 ) ^ aSign ) {
2324	invalid:
2325	float_raise( float_flag_invalid STATUS_VAR);
2326	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327	}
2328	if ( ( aSig<<shiftCount ) != savedASig ) {
2329	STATUS(float_exception_flags) \|= float_flag_inexact;
2330	}
2331	return z;
2332
2333	}
2334
2335	/*----------------------------------------------------------------------------
2336	\| Returns the result of converting the double-precision floating-point value
2337	\| `a' to the 64-bit two's complement integer format. The conversion is
2338	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2339	\| Arithmetic---which means in particular that the conversion is rounded
2340	\| according to the current rounding mode. If `a' is a NaN, the largest
2341	\| positive integer is returned. Otherwise, if the conversion overflows, the
2342	\| largest integer with the same sign as `a' is returned.
2343	----------------------------------------------------------------------------/
2344
2345	int64 float64_to_int64( float64 a STATUS_PARAM )
2346	{
2347	flag aSign;
2348	int16 aExp, shiftCount;
2349	bits64 aSig, aSigExtra;
2350
2351	aSig = extractFloat64Frac( a );
2352	aExp = extractFloat64Exp( a );
2353	aSign = extractFloat64Sign( a );
2354	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2355	shiftCount = 0x433 - aExp;
2356	if ( shiftCount <= 0 ) {
2357	if ( 0x43E < aExp ) {
2358	float_raise( float_flag_invalid STATUS_VAR);
2359	if ( ! aSign
2360	\|\| ( ( aExp == 0x7FF )
2361	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2362	) {
2363	return LIT64( 0x7FFFFFFFFFFFFFFF );
2364	}
2365	return (sbits64) LIT64( 0x8000000000000000 );
2366	}
2367	aSigExtra = 0;
2368	aSig <<= - shiftCount;
2369	}
2370	else {
2371	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372	}
2373	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2374
2375	}
2376
2377	/*----------------------------------------------------------------------------
2378	\| Returns the result of converting the double-precision floating-point value
2379	\| `a' to the 64-bit two's complement integer format. The conversion is
2380	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2381	\| Arithmetic, except that the conversion is always rounded toward zero.
2382	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2383	\| the conversion overflows, the largest integer with the same sign as `a' is
2384	\| returned.
2385	----------------------------------------------------------------------------/
2386
2387	int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388	{
2389	flag aSign;
2390	int16 aExp, shiftCount;
2391	bits64 aSig;
2392	int64 z;
2393
2394	aSig = extractFloat64Frac( a );
2395	aExp = extractFloat64Exp( a );
2396	aSign = extractFloat64Sign( a );
2397	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2398	shiftCount = aExp - 0x433;
2399	if ( 0 <= shiftCount ) {
2400	if ( 0x43E <= aExp ) {
2401	if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2402	float_raise( float_flag_invalid STATUS_VAR);
2403	if ( ! aSign
2404	\|\| ( ( aExp == 0x7FF )
2405	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2406	) {
2407	return LIT64( 0x7FFFFFFFFFFFFFFF );
2408	}
2409	}
2410	return (sbits64) LIT64( 0x8000000000000000 );
2411	}
2412	z = aSig<<shiftCount;
2413	}
2414	else {
2415	if ( aExp < 0x3FE ) {
2416	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2417	return 0;
2418	}
2419	z = aSig>>( - shiftCount );
2420	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2421	STATUS(float_exception_flags) \|= float_flag_inexact;
2422	}
2423	}
2424	if ( aSign ) z = - z;
2425	return z;
2426
2427	}
2428
2429	/*----------------------------------------------------------------------------
2430	\| Returns the result of converting the double-precision floating-point value
2431	\| `a' to the single-precision floating-point format. The conversion is
2432	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2433	\| Arithmetic.
2434	----------------------------------------------------------------------------/
2435
2436	float32 float64_to_float32( float64 a STATUS_PARAM )
2437	{
2438	flag aSign;
2439	int16 aExp;
2440	bits64 aSig;
2441	bits32 zSig;
2442
2443	aSig = extractFloat64Frac( a );
2444	aExp = extractFloat64Exp( a );
2445	aSign = extractFloat64Sign( a );
2446	if ( aExp == 0x7FF ) {
2447	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2448	return packFloat32( aSign, 0xFF, 0 );
2449	}
2450	shift64RightJamming( aSig, 22, &aSig );
2451	zSig = aSig;
2452	if ( aExp \|\| zSig ) {
2453	zSig \|= 0x40000000;
2454	aExp -= 0x381;
2455	}
2456	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2457
2458	}
2459
2460	#ifdef FLOATX80
2461
2462	/*----------------------------------------------------------------------------
2463	\| Returns the result of converting the double-precision floating-point value
2464	\| `a' to the extended double-precision floating-point format. The conversion
2465	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2466	\| Arithmetic.
2467	----------------------------------------------------------------------------/
2468
2469	floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2470	{
2471	flag aSign;
2472	int16 aExp;
2473	bits64 aSig;
2474
2475	aSig = extractFloat64Frac( a );
2476	aExp = extractFloat64Exp( a );
2477	aSign = extractFloat64Sign( a );
2478	if ( aExp == 0x7FF ) {
2479	if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2480	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2481	}
2482	if ( aExp == 0 ) {
2483	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2484	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2485	}
2486	return
2487	packFloatx80(
2488	aSign, aExp + 0x3C00, ( aSig \| LIT64( 0x0010000000000000 ) )<<11 );
2489
2490	}
2491
2492	#endif
2493
2494	#ifdef FLOAT128
2495
2496	/*----------------------------------------------------------------------------
2497	\| Returns the result of converting the double-precision floating-point value
2498	\| `a' to the quadruple-precision floating-point format. The conversion is
2499	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2500	\| Arithmetic.
2501	----------------------------------------------------------------------------/
2502
2503	float128 float64_to_float128( float64 a STATUS_PARAM )
2504	{
2505	flag aSign;
2506	int16 aExp;
2507	bits64 aSig, zSig0, zSig1;
2508
2509	aSig = extractFloat64Frac( a );
2510	aExp = extractFloat64Exp( a );
2511	aSign = extractFloat64Sign( a );
2512	if ( aExp == 0x7FF ) {
2513	if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2514	return packFloat128( aSign, 0x7FFF, 0, 0 );
2515	}
2516	if ( aExp == 0 ) {
2517	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2518	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2519	--aExp;
2520	}
2521	shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2522	return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2523
2524	}
2525
2526	#endif
2527
2528	/*----------------------------------------------------------------------------
2529	\| Rounds the double-precision floating-point value `a' to an integer, and
2530	\| returns the result as a double-precision floating-point value. The
2531	\| operation is performed according to the IEC/IEEE Standard for Binary
2532	\| Floating-Point Arithmetic.
2533	----------------------------------------------------------------------------/
2534
2535	float64 float64_round_to_int( float64 a STATUS_PARAM )
2536	{
2537	flag aSign;
2538	int16 aExp;
2539	bits64 lastBitMask, roundBitsMask;
2540	int8 roundingMode;
2541	bits64 z;
2542
2543	aExp = extractFloat64Exp( a );
2544	if ( 0x433 <= aExp ) {
2545	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2546	return propagateFloat64NaN( a, a STATUS_VAR );
2547	}
2548	return a;
2549	}
2550	if ( aExp < 0x3FF ) {
2551	if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2552	STATUS(float_exception_flags) \|= float_flag_inexact;
2553	aSign = extractFloat64Sign( a );
2554	switch ( STATUS(float_rounding_mode) ) {
2555	case float_round_nearest_even:
2556	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2557	return packFloat64( aSign, 0x3FF, 0 );
2558	}
2559	break;
2560	case float_round_down:
2561	return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2562	case float_round_up:
2563	return make_float64(
2564	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2565	}
2566	return packFloat64( aSign, 0, 0 );
2567	}
2568	lastBitMask = 1;
2569	lastBitMask <<= 0x433 - aExp;
2570	roundBitsMask = lastBitMask - 1;
2571	z = float64_val(a);
2572	roundingMode = STATUS(float_rounding_mode);
2573	if ( roundingMode == float_round_nearest_even ) {
2574	z += lastBitMask>>1;
2575	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2576	}
2577	else if ( roundingMode != float_round_to_zero ) {
2578	if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2579	z += roundBitsMask;
2580	}
2581	}
2582	z &= ~ roundBitsMask;
2583	if ( z != float64_val(a) )
2584	STATUS(float_exception_flags) \|= float_flag_inexact;
2585	return make_float64(z);
2586
2587	}
2588
2589	float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2590	{
2591	int oldmode;
2592	float64 res;
2593	oldmode = STATUS(float_rounding_mode);
2594	STATUS(float_rounding_mode) = float_round_to_zero;
2595	res = float64_round_to_int(a STATUS_VAR);
2596	STATUS(float_rounding_mode) = oldmode;
2597	return res;
2598	}
2599
2600	/*----------------------------------------------------------------------------
2601	\| Returns the result of adding the absolute values of the double-precision
2602	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2603	\| before being returned. `zSign' is ignored if the result is a NaN.
2604	\| The addition is performed according to the IEC/IEEE Standard for Binary
2605	\| Floating-Point Arithmetic.
2606	----------------------------------------------------------------------------/
2607
2608	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2609	{
2610	int16 aExp, bExp, zExp;
2611	bits64 aSig, bSig, zSig;
2612	int16 expDiff;
2613
2614	aSig = extractFloat64Frac( a );
2615	aExp = extractFloat64Exp( a );
2616	bSig = extractFloat64Frac( b );
2617	bExp = extractFloat64Exp( b );
2618	expDiff = aExp - bExp;
2619	aSig <<= 9;
2620	bSig <<= 9;
2621	if ( 0 < expDiff ) {
2622	if ( aExp == 0x7FF ) {
2623	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624	return a;
2625	}
2626	if ( bExp == 0 ) {
2627	--expDiff;
2628	}
2629	else {
2630	bSig \|= LIT64( 0x2000000000000000 );
2631	}
2632	shift64RightJamming( bSig, expDiff, &bSig );
2633	zExp = aExp;
2634	}
2635	else if ( expDiff < 0 ) {
2636	if ( bExp == 0x7FF ) {
2637	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2638	return packFloat64( zSign, 0x7FF, 0 );
2639	}
2640	if ( aExp == 0 ) {
2641	++expDiff;
2642	}
2643	else {
2644	aSig \|= LIT64( 0x2000000000000000 );
2645	}
2646	shift64RightJamming( aSig, - expDiff, &aSig );
2647	zExp = bExp;
2648	}
2649	else {
2650	if ( aExp == 0x7FF ) {
2651	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652	return a;
2653	}
2654	if ( aExp == 0 ) {
2655	if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2656	return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2657	}
2658	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2659	zExp = aExp;
2660	goto roundAndPack;
2661	}
2662	aSig \|= LIT64( 0x2000000000000000 );
2663	zSig = ( aSig + bSig )<<1;
2664	--zExp;
2665	if ( (sbits64) zSig < 0 ) {
2666	zSig = aSig + bSig;
2667	++zExp;
2668	}
2669	roundAndPack:
2670	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2671
2672	}
2673
2674	/*----------------------------------------------------------------------------
2675	\| Returns the result of subtracting the absolute values of the double-
2676	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
2677	\| difference is negated before being returned. `zSign' is ignored if the
2678	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
2679	\| Standard for Binary Floating-Point Arithmetic.
2680	----------------------------------------------------------------------------/
2681
2682	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2683	{
2684	int16 aExp, bExp, zExp;
2685	bits64 aSig, bSig, zSig;
2686	int16 expDiff;
2687
2688	aSig = extractFloat64Frac( a );
2689	aExp = extractFloat64Exp( a );
2690	bSig = extractFloat64Frac( b );
2691	bExp = extractFloat64Exp( b );
2692	expDiff = aExp - bExp;
2693	aSig <<= 10;
2694	bSig <<= 10;
2695	if ( 0 < expDiff ) goto aExpBigger;
2696	if ( expDiff < 0 ) goto bExpBigger;
2697	if ( aExp == 0x7FF ) {
2698	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2699	float_raise( float_flag_invalid STATUS_VAR);
2700	return float64_default_nan;
2701	}
2702	if ( aExp == 0 ) {
2703	aExp = 1;
2704	bExp = 1;
2705	}
2706	if ( bSig < aSig ) goto aBigger;
2707	if ( aSig < bSig ) goto bBigger;
2708	return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2709	bExpBigger:
2710	if ( bExp == 0x7FF ) {
2711	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2712	return packFloat64( zSign ^ 1, 0x7FF, 0 );
2713	}
2714	if ( aExp == 0 ) {
2715	++expDiff;
2716	}
2717	else {
2718	aSig \|= LIT64( 0x4000000000000000 );
2719	}
2720	shift64RightJamming( aSig, - expDiff, &aSig );
2721	bSig \|= LIT64( 0x4000000000000000 );
2722	bBigger:
2723	zSig = bSig - aSig;
2724	zExp = bExp;
2725	zSign ^= 1;
2726	goto normalizeRoundAndPack;
2727	aExpBigger:
2728	if ( aExp == 0x7FF ) {
2729	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2730	return a;
2731	}
2732	if ( bExp == 0 ) {
2733	--expDiff;
2734	}
2735	else {
2736	bSig \|= LIT64( 0x4000000000000000 );
2737	}
2738	shift64RightJamming( bSig, expDiff, &bSig );
2739	aSig \|= LIT64( 0x4000000000000000 );
2740	aBigger:
2741	zSig = aSig - bSig;
2742	zExp = aExp;
2743	normalizeRoundAndPack:
2744	--zExp;
2745	return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2746
2747	}
2748
2749	/*----------------------------------------------------------------------------
2750	\| Returns the result of adding the double-precision floating-point values `a'
2751	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
2752	\| Binary Floating-Point Arithmetic.
2753	----------------------------------------------------------------------------/
2754
2755	float64 float64_add( float64 a, float64 b STATUS_PARAM )
2756	{
2757	flag aSign, bSign;
2758
2759	aSign = extractFloat64Sign( a );
2760	bSign = extractFloat64Sign( b );
2761	if ( aSign == bSign ) {
2762	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2763	}
2764	else {
2765	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2766	}
2767
2768	}
2769
2770	/*----------------------------------------------------------------------------
2771	\| Returns the result of subtracting the double-precision floating-point values
2772	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2773	\| for Binary Floating-Point Arithmetic.
2774	----------------------------------------------------------------------------/
2775
2776	float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2777	{
2778	flag aSign, bSign;
2779
2780	aSign = extractFloat64Sign( a );
2781	bSign = extractFloat64Sign( b );
2782	if ( aSign == bSign ) {
2783	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2784	}
2785	else {
2786	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2787	}
2788
2789	}
2790
2791	/*----------------------------------------------------------------------------
2792	\| Returns the result of multiplying the double-precision floating-point values
2793	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2794	\| for Binary Floating-Point Arithmetic.
2795	----------------------------------------------------------------------------/
2796
2797	float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2798	{
2799	flag aSign, bSign, zSign;
2800	int16 aExp, bExp, zExp;
2801	bits64 aSig, bSig, zSig0, zSig1;
2802
2803	aSig = extractFloat64Frac( a );
2804	aExp = extractFloat64Exp( a );
2805	aSign = extractFloat64Sign( a );
2806	bSig = extractFloat64Frac( b );
2807	bExp = extractFloat64Exp( b );
2808	bSign = extractFloat64Sign( b );
2809	zSign = aSign ^ bSign;
2810	if ( aExp == 0x7FF ) {
2811	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2812	return propagateFloat64NaN( a, b STATUS_VAR );
2813	}
2814	if ( ( bExp \| bSig ) == 0 ) {
2815	float_raise( float_flag_invalid STATUS_VAR);
2816	return float64_default_nan;
2817	}
2818	return packFloat64( zSign, 0x7FF, 0 );
2819	}
2820	if ( bExp == 0x7FF ) {
2821	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822	if ( ( aExp \| aSig ) == 0 ) {
2823	float_raise( float_flag_invalid STATUS_VAR);
2824	return float64_default_nan;
2825	}
2826	return packFloat64( zSign, 0x7FF, 0 );
2827	}
2828	if ( aExp == 0 ) {
2829	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2830	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2831	}
2832	if ( bExp == 0 ) {
2833	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2834	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835	}
2836	zExp = aExp + bExp - 0x3FF;
2837	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2838	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2839	mul64To128( aSig, bSig, &zSig0, &zSig1 );
2840	zSig0 \|= ( zSig1 != 0 );
2841	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2842	zSig0 <<= 1;
2843	--zExp;
2844	}
2845	return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2846
2847	}
2848
2849	/*----------------------------------------------------------------------------
2850	\| Returns the result of dividing the double-precision floating-point value `a'
2851	\| by the corresponding value `b'. The operation is performed according to
2852	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2853	----------------------------------------------------------------------------/
2854
2855	float64 float64_div( float64 a, float64 b STATUS_PARAM )
2856	{
2857	flag aSign, bSign, zSign;
2858	int16 aExp, bExp, zExp;
2859	bits64 aSig, bSig, zSig;
2860	bits64 rem0, rem1;
2861	bits64 term0, term1;
2862
2863	aSig = extractFloat64Frac( a );
2864	aExp = extractFloat64Exp( a );
2865	aSign = extractFloat64Sign( a );
2866	bSig = extractFloat64Frac( b );
2867	bExp = extractFloat64Exp( b );
2868	bSign = extractFloat64Sign( b );
2869	zSign = aSign ^ bSign;
2870	if ( aExp == 0x7FF ) {
2871	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2872	if ( bExp == 0x7FF ) {
2873	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2874	float_raise( float_flag_invalid STATUS_VAR);
2875	return float64_default_nan;
2876	}
2877	return packFloat64( zSign, 0x7FF, 0 );
2878	}
2879	if ( bExp == 0x7FF ) {
2880	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2881	return packFloat64( zSign, 0, 0 );
2882	}
2883	if ( bExp == 0 ) {
2884	if ( bSig == 0 ) {
2885	if ( ( aExp \| aSig ) == 0 ) {
2886	float_raise( float_flag_invalid STATUS_VAR);
2887	return float64_default_nan;
2888	}
2889	float_raise( float_flag_divbyzero STATUS_VAR);
2890	return packFloat64( zSign, 0x7FF, 0 );
2891	}
2892	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2893	}
2894	if ( aExp == 0 ) {
2895	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2896	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2897	}
2898	zExp = aExp - bExp + 0x3FD;
2899	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2900	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2901	if ( bSig <= ( aSig + aSig ) ) {
2902	aSig >>= 1;
2903	++zExp;
2904	}
2905	zSig = estimateDiv128To64( aSig, 0, bSig );
2906	if ( ( zSig & 0x1FF ) <= 2 ) {
2907	mul64To128( bSig, zSig, &term0, &term1 );
2908	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2909	while ( (sbits64) rem0 < 0 ) {
2910	--zSig;
2911	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2912	}
2913	zSig \|= ( rem1 != 0 );
2914	}
2915	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2916
2917	}
2918
2919	/*----------------------------------------------------------------------------
2920	\| Returns the remainder of the double-precision floating-point value `a'
2921	\| with respect to the corresponding value `b'. The operation is performed
2922	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923	----------------------------------------------------------------------------/
2924
2925	float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2926	{
2927	flag aSign, bSign, zSign;
2928	int16 aExp, bExp, expDiff;
2929	bits64 aSig, bSig;
2930	bits64 q, alternateASig;
2931	sbits64 sigMean;
2932
2933	aSig = extractFloat64Frac( a );
2934	aExp = extractFloat64Exp( a );
2935	aSign = extractFloat64Sign( a );
2936	bSig = extractFloat64Frac( b );
2937	bExp = extractFloat64Exp( b );
2938	bSign = extractFloat64Sign( b );
2939	if ( aExp == 0x7FF ) {
2940	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2941	return propagateFloat64NaN( a, b STATUS_VAR );
2942	}
2943	float_raise( float_flag_invalid STATUS_VAR);
2944	return float64_default_nan;
2945	}
2946	if ( bExp == 0x7FF ) {
2947	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2948	return a;
2949	}
2950	if ( bExp == 0 ) {
2951	if ( bSig == 0 ) {
2952	float_raise( float_flag_invalid STATUS_VAR);
2953	return float64_default_nan;
2954	}
2955	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956	}
2957	if ( aExp == 0 ) {
2958	if ( aSig == 0 ) return a;
2959	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960	}
2961	expDiff = aExp - bExp;
2962	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
2963	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2964	if ( expDiff < 0 ) {
2965	if ( expDiff < -1 ) return a;
2966	aSig >>= 1;
2967	}
2968	q = ( bSig <= aSig );
2969	if ( q ) aSig -= bSig;
2970	expDiff -= 64;
2971	while ( 0 < expDiff ) {
2972	q = estimateDiv128To64( aSig, 0, bSig );
2973	q = ( 2 < q ) ? q - 2 : 0;
2974	aSig = - ( ( bSig>>2 ) * q );
2975	expDiff -= 62;
2976	}
2977	expDiff += 64;
2978	if ( 0 < expDiff ) {
2979	q = estimateDiv128To64( aSig, 0, bSig );
2980	q = ( 2 < q ) ? q - 2 : 0;
2981	q >>= 64 - expDiff;
2982	bSig >>= 2;
2983	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2984	}
2985	else {
2986	aSig >>= 2;
2987	bSig >>= 2;
2988	}
2989	do {
2990	alternateASig = aSig;
2991	++q;
2992	aSig -= bSig;
2993	} while ( 0 <= (sbits64) aSig );
2994	sigMean = aSig + alternateASig;
2995	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2996	aSig = alternateASig;
2997	}
2998	zSign = ( (sbits64) aSig < 0 );
2999	if ( zSign ) aSig = - aSig;
3000	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3001
3002	}
3003
3004	/*----------------------------------------------------------------------------
3005	\| Returns the square root of the double-precision floating-point value `a'.
3006	\| The operation is performed according to the IEC/IEEE Standard for Binary
3007	\| Floating-Point Arithmetic.
3008	----------------------------------------------------------------------------/
3009
3010	float64 float64_sqrt( float64 a STATUS_PARAM )
3011	{
3012	flag aSign;
3013	int16 aExp, zExp;
3014	bits64 aSig, zSig, doubleZSig;
3015	bits64 rem0, rem1, term0, term1;
3016
3017	aSig = extractFloat64Frac( a );
3018	aExp = extractFloat64Exp( a );
3019	aSign = extractFloat64Sign( a );
3020	if ( aExp == 0x7FF ) {
3021	if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3022	if ( ! aSign ) return a;
3023	float_raise( float_flag_invalid STATUS_VAR);
3024	return float64_default_nan;
3025	}
3026	if ( aSign ) {
3027	if ( ( aExp \| aSig ) == 0 ) return a;
3028	float_raise( float_flag_invalid STATUS_VAR);
3029	return float64_default_nan;
3030	}
3031	if ( aExp == 0 ) {
3032	if ( aSig == 0 ) return float64_zero;
3033	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034	}
3035	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3036	aSig \|= LIT64( 0x0010000000000000 );
3037	zSig = estimateSqrt32( aExp, aSig>>21 );
3038	aSig <<= 9 - ( aExp & 1 );
3039	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3040	if ( ( zSig & 0x1FF ) <= 5 ) {
3041	doubleZSig = zSig<<1;
3042	mul64To128( zSig, zSig, &term0, &term1 );
3043	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3044	while ( (sbits64) rem0 < 0 ) {
3045	--zSig;
3046	doubleZSig -= 2;
3047	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
3048	}
3049	zSig \|= ( ( rem0 \| rem1 ) != 0 );
3050	}
3051	return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3052
3053	}
3054
3055	/*----------------------------------------------------------------------------
3056	\| Returns the binary log of the double-precision floating-point value `a'.
3057	\| The operation is performed according to the IEC/IEEE Standard for Binary
3058	\| Floating-Point Arithmetic.
3059	----------------------------------------------------------------------------/
3060	float64 float64_log2( float64 a STATUS_PARAM )
3061	{
3062	flag aSign, zSign;
3063	int16 aExp;
3064	bits64 aSig, aSig0, aSig1, zSig, i;
3065
3066	aSig = extractFloat64Frac( a );
3067	aExp = extractFloat64Exp( a );
3068	aSign = extractFloat64Sign( a );
3069
3070	if ( aExp == 0 ) {
3071	if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3072	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3073	}
3074	if ( aSign ) {
3075	float_raise( float_flag_invalid STATUS_VAR);
3076	return float64_default_nan;
3077	}
3078	if ( aExp == 0x7FF ) {
3079	if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3080	return a;
3081	}
3082
3083	aExp -= 0x3FF;
3084	aSig \|= LIT64( 0x0010000000000000 );
3085	zSign = aExp < 0;
3086	zSig = (bits64)aExp << 52;
3087	for (i = 1LL << 51; i > 0; i >>= 1) {
3088	mul64To128( aSig, aSig, &aSig0, &aSig1 );
3089	aSig = ( aSig0 << 12 ) \| ( aSig1 >> 52 );
3090	if ( aSig & LIT64( 0x0020000000000000 ) ) {
3091	aSig >>= 1;
3092	zSig \|= i;
3093	}
3094	}
3095
3096	if ( zSign )
3097	zSig = -zSig;
3098	return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3099	}
3100
3101	/*----------------------------------------------------------------------------
3102	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3103	\| corresponding value `b', and 0 otherwise. The comparison is performed
3104	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3105	----------------------------------------------------------------------------/
3106
3107	int float64_eq( float64 a, float64 b STATUS_PARAM )
3108	{
3109	bits64 av, bv;
3110
3111	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113	) {
3114	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3115	float_raise( float_flag_invalid STATUS_VAR);
3116	}
3117	return 0;
3118	}
3119	av = float64_val(a);
3120	bv = float64_val(b);
3121	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3122
3123	}
3124
3125	/*----------------------------------------------------------------------------
3126	\| Returns 1 if the double-precision floating-point value `a' is less than or
3127	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3128	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3129	\| Arithmetic.
3130	----------------------------------------------------------------------------/
3131
3132	int float64_le( float64 a, float64 b STATUS_PARAM )
3133	{
3134	flag aSign, bSign;
3135	bits64 av, bv;
3136
3137	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139	) {
3140	float_raise( float_flag_invalid STATUS_VAR);
3141	return 0;
3142	}
3143	aSign = extractFloat64Sign( a );
3144	bSign = extractFloat64Sign( b );
3145	av = float64_val(a);
3146	bv = float64_val(b);
3147	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3148	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3149
3150	}
3151
3152	/*----------------------------------------------------------------------------
3153	\| Returns 1 if the double-precision floating-point value `a' is less than
3154	\| the corresponding value `b', and 0 otherwise. The comparison is performed
3155	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3156	----------------------------------------------------------------------------/
3157
3158	int float64_lt( float64 a, float64 b STATUS_PARAM )
3159	{
3160	flag aSign, bSign;
3161	bits64 av, bv;
3162
3163	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3164	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3165	) {
3166	float_raise( float_flag_invalid STATUS_VAR);
3167	return 0;
3168	}
3169	aSign = extractFloat64Sign( a );
3170	bSign = extractFloat64Sign( b );
3171	av = float64_val(a);
3172	bv = float64_val(b);
3173	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3174	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3175
3176	}
3177
3178	/*----------------------------------------------------------------------------
3179	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3180	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
3181	\| if either operand is a NaN. Otherwise, the comparison is performed
3182	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3183	----------------------------------------------------------------------------/
3184
3185	int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3186	{
3187	bits64 av, bv;
3188
3189	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3190	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3191	) {
3192	float_raise( float_flag_invalid STATUS_VAR);
3193	return 0;
3194	}
3195	av = float64_val(a);
3196	bv = float64_val(b);
3197	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3198
3199	}
3200
3201	/*----------------------------------------------------------------------------
3202	\| Returns 1 if the double-precision floating-point value `a' is less than or
3203	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3204	\| cause an exception. Otherwise, the comparison is performed according to the
3205	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3206	----------------------------------------------------------------------------/
3207
3208	int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3209	{
3210	flag aSign, bSign;
3211	bits64 av, bv;
3212
3213	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3214	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3215	) {
3216	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3217	float_raise( float_flag_invalid STATUS_VAR);
3218	}
3219	return 0;
3220	}
3221	aSign = extractFloat64Sign( a );
3222	bSign = extractFloat64Sign( b );
3223	av = float64_val(a);
3224	bv = float64_val(b);
3225	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3226	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3227
3228	}
3229
3230	/*----------------------------------------------------------------------------
3231	\| Returns 1 if the double-precision floating-point value `a' is less than
3232	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3233	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3234	\| Standard for Binary Floating-Point Arithmetic.
3235	----------------------------------------------------------------------------/
3236
3237	int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3238	{
3239	flag aSign, bSign;
3240	bits64 av, bv;
3241
3242	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3243	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3244	) {
3245	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3246	float_raise( float_flag_invalid STATUS_VAR);
3247	}
3248	return 0;
3249	}
3250	aSign = extractFloat64Sign( a );
3251	bSign = extractFloat64Sign( b );
3252	av = float64_val(a);
3253	bv = float64_val(b);
3254	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3255	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3256
3257	}
3258
3259	#ifdef FLOATX80
3260
3261	/*----------------------------------------------------------------------------
3262	\| Returns the result of converting the extended double-precision floating-
3263	\| point value `a' to the 32-bit two's complement integer format. The
3264	\| conversion is performed according to the IEC/IEEE Standard for Binary
3265	\| Floating-Point Arithmetic---which means in particular that the conversion
3266	\| is rounded according to the current rounding mode. If `a' is a NaN, the
3267	\| largest positive integer is returned. Otherwise, if the conversion
3268	\| overflows, the largest integer with the same sign as `a' is returned.
3269	----------------------------------------------------------------------------/
3270
3271	int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3272	{
3273	flag aSign;
3274	int32 aExp, shiftCount;
3275	bits64 aSig;
3276
3277	aSig = extractFloatx80Frac( a );
3278	aExp = extractFloatx80Exp( a );
3279	aSign = extractFloatx80Sign( a );
3280	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3281	shiftCount = 0x4037 - aExp;
3282	if ( shiftCount <= 0 ) shiftCount = 1;
3283	shift64RightJamming( aSig, shiftCount, &aSig );
3284	return roundAndPackInt32( aSign, aSig STATUS_VAR );
3285
3286	}
3287
3288	/*----------------------------------------------------------------------------
3289	\| Returns the result of converting the extended double-precision floating-
3290	\| point value `a' to the 32-bit two's complement integer format. The
3291	\| conversion is performed according to the IEC/IEEE Standard for Binary
3292	\| Floating-Point Arithmetic, except that the conversion is always rounded
3293	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3294	\| Otherwise, if the conversion overflows, the largest integer with the same
3295	\| sign as `a' is returned.
3296	----------------------------------------------------------------------------/
3297
3298	int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3299	{
3300	flag aSign;
3301	int32 aExp, shiftCount;
3302	bits64 aSig, savedASig;
3303	int32 z;
3304
3305	aSig = extractFloatx80Frac( a );
3306	aExp = extractFloatx80Exp( a );
3307	aSign = extractFloatx80Sign( a );
3308	if ( 0x401E < aExp ) {
3309	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3310	goto invalid;
3311	}
3312	else if ( aExp < 0x3FFF ) {
3313	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3314	return 0;
3315	}
3316	shiftCount = 0x403E - aExp;
3317	savedASig = aSig;
3318	aSig >>= shiftCount;
3319	z = aSig;
3320	if ( aSign ) z = - z;
3321	if ( ( z < 0 ) ^ aSign ) {
3322	invalid:
3323	float_raise( float_flag_invalid STATUS_VAR);
3324	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3325	}
3326	if ( ( aSig<<shiftCount ) != savedASig ) {
3327	STATUS(float_exception_flags) \|= float_flag_inexact;
3328	}
3329	return z;
3330
3331	}
3332
3333	/*----------------------------------------------------------------------------
3334	\| Returns the result of converting the extended double-precision floating-
3335	\| point value `a' to the 64-bit two's complement integer format. The
3336	\| conversion is performed according to the IEC/IEEE Standard for Binary
3337	\| Floating-Point Arithmetic---which means in particular that the conversion
3338	\| is rounded according to the current rounding mode. If `a' is a NaN,
3339	\| the largest positive integer is returned. Otherwise, if the conversion
3340	\| overflows, the largest integer with the same sign as `a' is returned.
3341	----------------------------------------------------------------------------/
3342
3343	int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3344	{
3345	flag aSign;
3346	int32 aExp, shiftCount;
3347	bits64 aSig, aSigExtra;
3348
3349	aSig = extractFloatx80Frac( a );
3350	aExp = extractFloatx80Exp( a );
3351	aSign = extractFloatx80Sign( a );
3352	shiftCount = 0x403E - aExp;
3353	if ( shiftCount <= 0 ) {
3354	if ( shiftCount ) {
3355	float_raise( float_flag_invalid STATUS_VAR);
3356	if ( ! aSign
3357	\|\| ( ( aExp == 0x7FFF )
3358	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3359	) {
3360	return LIT64( 0x7FFFFFFFFFFFFFFF );
3361	}
3362	return (sbits64) LIT64( 0x8000000000000000 );
3363	}
3364	aSigExtra = 0;
3365	}
3366	else {
3367	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3368	}
3369	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3370
3371	}
3372
3373	/*----------------------------------------------------------------------------
3374	\| Returns the result of converting the extended double-precision floating-
3375	\| point value `a' to the 64-bit two's complement integer format. The
3376	\| conversion is performed according to the IEC/IEEE Standard for Binary
3377	\| Floating-Point Arithmetic, except that the conversion is always rounded
3378	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3379	\| Otherwise, if the conversion overflows, the largest integer with the same
3380	\| sign as `a' is returned.
3381	----------------------------------------------------------------------------/
3382
3383	int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3384	{
3385	flag aSign;
3386	int32 aExp, shiftCount;
3387	bits64 aSig;
3388	int64 z;
3389
3390	aSig = extractFloatx80Frac( a );
3391	aExp = extractFloatx80Exp( a );
3392	aSign = extractFloatx80Sign( a );
3393	shiftCount = aExp - 0x403E;
3394	if ( 0 <= shiftCount ) {
3395	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3396	if ( ( a.high != 0xC03E ) \|\| aSig ) {
3397	float_raise( float_flag_invalid STATUS_VAR);
3398	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
3399	return LIT64( 0x7FFFFFFFFFFFFFFF );
3400	}
3401	}
3402	return (sbits64) LIT64( 0x8000000000000000 );
3403	}
3404	else if ( aExp < 0x3FFF ) {
3405	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3406	return 0;
3407	}
3408	z = aSig>>( - shiftCount );
3409	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3410	STATUS(float_exception_flags) \|= float_flag_inexact;
3411	}
3412	if ( aSign ) z = - z;
3413	return z;
3414
3415	}
3416
3417	/*----------------------------------------------------------------------------
3418	\| Returns the result of converting the extended double-precision floating-
3419	\| point value `a' to the single-precision floating-point format. The
3420	\| conversion is performed according to the IEC/IEEE Standard for Binary
3421	\| Floating-Point Arithmetic.
3422	----------------------------------------------------------------------------/
3423
3424	float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3425	{
3426	flag aSign;
3427	int32 aExp;
3428	bits64 aSig;
3429
3430	aSig = extractFloatx80Frac( a );
3431	aExp = extractFloatx80Exp( a );
3432	aSign = extractFloatx80Sign( a );
3433	if ( aExp == 0x7FFF ) {
3434	if ( (bits64) ( aSig<<1 ) ) {
3435	return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3436	}
3437	return packFloat32( aSign, 0xFF, 0 );
3438	}
3439	shift64RightJamming( aSig, 33, &aSig );
3440	if ( aExp \|\| aSig ) aExp -= 0x3F81;
3441	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3442
3443	}
3444
3445	/*----------------------------------------------------------------------------
3446	\| Returns the result of converting the extended double-precision floating-
3447	\| point value `a' to the double-precision floating-point format. The
3448	\| conversion is performed according to the IEC/IEEE Standard for Binary
3449	\| Floating-Point Arithmetic.
3450	----------------------------------------------------------------------------/
3451
3452	float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3453	{
3454	flag aSign;
3455	int32 aExp;
3456	bits64 aSig, zSig;
3457
3458	aSig = extractFloatx80Frac( a );
3459	aExp = extractFloatx80Exp( a );
3460	aSign = extractFloatx80Sign( a );
3461	if ( aExp == 0x7FFF ) {
3462	if ( (bits64) ( aSig<<1 ) ) {
3463	return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3464	}
3465	return packFloat64( aSign, 0x7FF, 0 );
3466	}
3467	shift64RightJamming( aSig, 1, &zSig );
3468	if ( aExp \|\| aSig ) aExp -= 0x3C01;
3469	return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3470
3471	}
3472
3473	#ifdef FLOAT128
3474
3475	/*----------------------------------------------------------------------------
3476	\| Returns the result of converting the extended double-precision floating-
3477	\| point value `a' to the quadruple-precision floating-point format. The
3478	\| conversion is performed according to the IEC/IEEE Standard for Binary
3479	\| Floating-Point Arithmetic.
3480	----------------------------------------------------------------------------/
3481
3482	float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3483	{
3484	flag aSign;
3485	int16 aExp;
3486	bits64 aSig, zSig0, zSig1;
3487
3488	aSig = extractFloatx80Frac( a );
3489	aExp = extractFloatx80Exp( a );
3490	aSign = extractFloatx80Sign( a );
3491	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3492	return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3493	}
3494	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3495	return packFloat128( aSign, aExp, zSig0, zSig1 );
3496
3497	}
3498
3499	#endif
3500
3501	/*----------------------------------------------------------------------------
3502	\| Rounds the extended double-precision floating-point value `a' to an integer,
3503	\| and returns the result as an extended quadruple-precision floating-point
3504	\| value. The operation is performed according to the IEC/IEEE Standard for
3505	\| Binary Floating-Point Arithmetic.
3506	----------------------------------------------------------------------------/
3507
3508	floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3509	{
3510	flag aSign;
3511	int32 aExp;
3512	bits64 lastBitMask, roundBitsMask;
3513	int8 roundingMode;
3514	floatx80 z;
3515
3516	aExp = extractFloatx80Exp( a );
3517	if ( 0x403E <= aExp ) {
3518	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3519	return propagateFloatx80NaN( a, a STATUS_VAR );
3520	}
3521	return a;
3522	}
3523	if ( aExp < 0x3FFF ) {
3524	if ( ( aExp == 0 )
3525	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3526	return a;
3527	}
3528	STATUS(float_exception_flags) \|= float_flag_inexact;
3529	aSign = extractFloatx80Sign( a );
3530	switch ( STATUS(float_rounding_mode) ) {
3531	case float_round_nearest_even:
3532	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3533	) {
3534	return
3535	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536	}
3537	break;
3538	case float_round_down:
3539	return
3540	aSign ?
3541	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3542	: packFloatx80( 0, 0, 0 );
3543	case float_round_up:
3544	return
3545	aSign ? packFloatx80( 1, 0, 0 )
3546	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3547	}
3548	return packFloatx80( aSign, 0, 0 );
3549	}
3550	lastBitMask = 1;
3551	lastBitMask <<= 0x403E - aExp;
3552	roundBitsMask = lastBitMask - 1;
3553	z = a;
3554	roundingMode = STATUS(float_rounding_mode);
3555	if ( roundingMode == float_round_nearest_even ) {
3556	z.low += lastBitMask>>1;
3557	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3558	}
3559	else if ( roundingMode != float_round_to_zero ) {
3560	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3561	z.low += roundBitsMask;
3562	}
3563	}
3564	z.low &= ~ roundBitsMask;
3565	if ( z.low == 0 ) {
3566	++z.high;
3567	z.low = LIT64( 0x8000000000000000 );
3568	}
3569	if ( z.low != a.low ) STATUS(float_exception_flags) \|= float_flag_inexact;
3570	return z;
3571
3572	}
3573
3574	/*----------------------------------------------------------------------------
3575	\| Returns the result of adding the absolute values of the extended double-
3576	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3577	\| negated before being returned. `zSign' is ignored if the result is a NaN.
3578	\| The addition is performed according to the IEC/IEEE Standard for Binary
3579	\| Floating-Point Arithmetic.
3580	----------------------------------------------------------------------------/
3581
3582	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3583	{
3584	int32 aExp, bExp, zExp;
3585	bits64 aSig, bSig, zSig0, zSig1;
3586	int32 expDiff;
3587
3588	aSig = extractFloatx80Frac( a );
3589	aExp = extractFloatx80Exp( a );
3590	bSig = extractFloatx80Frac( b );
3591	bExp = extractFloatx80Exp( b );
3592	expDiff = aExp - bExp;
3593	if ( 0 < expDiff ) {
3594	if ( aExp == 0x7FFF ) {
3595	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3596	return a;
3597	}
3598	if ( bExp == 0 ) --expDiff;
3599	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3600	zExp = aExp;
3601	}
3602	else if ( expDiff < 0 ) {
3603	if ( bExp == 0x7FFF ) {
3604	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3605	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3606	}
3607	if ( aExp == 0 ) ++expDiff;
3608	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3609	zExp = bExp;
3610	}
3611	else {
3612	if ( aExp == 0x7FFF ) {
3613	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3614	return propagateFloatx80NaN( a, b STATUS_VAR );
3615	}
3616	return a;
3617	}
3618	zSig1 = 0;
3619	zSig0 = aSig + bSig;
3620	if ( aExp == 0 ) {
3621	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3622	goto roundAndPack;
3623	}
3624	zExp = aExp;
3625	goto shiftRight1;
3626	}
3627	zSig0 = aSig + bSig;
3628	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3629	shiftRight1:
3630	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631	zSig0 \|= LIT64( 0x8000000000000000 );
3632	++zExp;
3633	roundAndPack:
3634	return
3635	roundAndPackFloatx80(
3636	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3637
3638	}
3639
3640	/*----------------------------------------------------------------------------
3641	\| Returns the result of subtracting the absolute values of the extended
3642	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3643	\| difference is negated before being returned. `zSign' is ignored if the
3644	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
3645	\| Standard for Binary Floating-Point Arithmetic.
3646	----------------------------------------------------------------------------/
3647
3648	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3649	{
3650	int32 aExp, bExp, zExp;
3651	bits64 aSig, bSig, zSig0, zSig1;
3652	int32 expDiff;
3653	floatx80 z;
3654
3655	aSig = extractFloatx80Frac( a );
3656	aExp = extractFloatx80Exp( a );
3657	bSig = extractFloatx80Frac( b );
3658	bExp = extractFloatx80Exp( b );
3659	expDiff = aExp - bExp;
3660	if ( 0 < expDiff ) goto aExpBigger;
3661	if ( expDiff < 0 ) goto bExpBigger;
3662	if ( aExp == 0x7FFF ) {
3663	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3664	return propagateFloatx80NaN( a, b STATUS_VAR );
3665	}
3666	float_raise( float_flag_invalid STATUS_VAR);
3667	z.low = floatx80_default_nan_low;
3668	z.high = floatx80_default_nan_high;
3669	return z;
3670	}
3671	if ( aExp == 0 ) {
3672	aExp = 1;
3673	bExp = 1;
3674	}
3675	zSig1 = 0;
3676	if ( bSig < aSig ) goto aBigger;
3677	if ( aSig < bSig ) goto bBigger;
3678	return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3679	bExpBigger:
3680	if ( bExp == 0x7FFF ) {
3681	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3682	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683	}
3684	if ( aExp == 0 ) ++expDiff;
3685	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3686	bBigger:
3687	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3688	zExp = bExp;
3689	zSign ^= 1;
3690	goto normalizeRoundAndPack;
3691	aExpBigger:
3692	if ( aExp == 0x7FFF ) {
3693	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3694	return a;
3695	}
3696	if ( bExp == 0 ) --expDiff;
3697	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3698	aBigger:
3699	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3700	zExp = aExp;
3701	normalizeRoundAndPack:
3702	return
3703	normalizeRoundAndPackFloatx80(
3704	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705
3706	}
3707
3708	/*----------------------------------------------------------------------------
3709	\| Returns the result of adding the extended double-precision floating-point
3710	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
3711	\| Standard for Binary Floating-Point Arithmetic.
3712	----------------------------------------------------------------------------/
3713
3714	floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3715	{
3716	flag aSign, bSign;
3717
3718	aSign = extractFloatx80Sign( a );
3719	bSign = extractFloatx80Sign( b );
3720	if ( aSign == bSign ) {
3721	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3722	}
3723	else {
3724	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3725	}
3726
3727	}
3728
3729	/*----------------------------------------------------------------------------
3730	\| Returns the result of subtracting the extended double-precision floating-
3731	\| point values `a' and `b'. The operation is performed according to the
3732	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3733	----------------------------------------------------------------------------/
3734
3735	floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3736	{
3737	flag aSign, bSign;
3738
3739	aSign = extractFloatx80Sign( a );
3740	bSign = extractFloatx80Sign( b );
3741	if ( aSign == bSign ) {
3742	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3743	}
3744	else {
3745	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3746	}
3747
3748	}
3749
3750	/*----------------------------------------------------------------------------
3751	\| Returns the result of multiplying the extended double-precision floating-
3752	\| point values `a' and `b'. The operation is performed according to the
3753	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3754	----------------------------------------------------------------------------/
3755
3756	floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3757	{
3758	flag aSign, bSign, zSign;
3759	int32 aExp, bExp, zExp;
3760	bits64 aSig, bSig, zSig0, zSig1;
3761	floatx80 z;
3762
3763	aSig = extractFloatx80Frac( a );
3764	aExp = extractFloatx80Exp( a );
3765	aSign = extractFloatx80Sign( a );
3766	bSig = extractFloatx80Frac( b );
3767	bExp = extractFloatx80Exp( b );
3768	bSign = extractFloatx80Sign( b );
3769	zSign = aSign ^ bSign;
3770	if ( aExp == 0x7FFF ) {
3771	if ( (bits64) ( aSig<<1 )
3772	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3773	return propagateFloatx80NaN( a, b STATUS_VAR );
3774	}
3775	if ( ( bExp \| bSig ) == 0 ) goto invalid;
3776	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3777	}
3778	if ( bExp == 0x7FFF ) {
3779	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3780	if ( ( aExp \| aSig ) == 0 ) {
3781	invalid:
3782	float_raise( float_flag_invalid STATUS_VAR);
3783	z.low = floatx80_default_nan_low;
3784	z.high = floatx80_default_nan_high;
3785	return z;
3786	}
3787	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3788	}
3789	if ( aExp == 0 ) {
3790	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3791	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3792	}
3793	if ( bExp == 0 ) {
3794	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3795	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3796	}
3797	zExp = aExp + bExp - 0x3FFE;
3798	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3799	if ( 0 < (sbits64) zSig0 ) {
3800	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3801	--zExp;
3802	}
3803	return
3804	roundAndPackFloatx80(
3805	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3806
3807	}
3808
3809	/*----------------------------------------------------------------------------
3810	\| Returns the result of dividing the extended double-precision floating-point
3811	\| value `a' by the corresponding value `b'. The operation is performed
3812	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3813	----------------------------------------------------------------------------/
3814
3815	floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3816	{
3817	flag aSign, bSign, zSign;
3818	int32 aExp, bExp, zExp;
3819	bits64 aSig, bSig, zSig0, zSig1;
3820	bits64 rem0, rem1, rem2, term0, term1, term2;
3821	floatx80 z;
3822
3823	aSig = extractFloatx80Frac( a );
3824	aExp = extractFloatx80Exp( a );
3825	aSign = extractFloatx80Sign( a );
3826	bSig = extractFloatx80Frac( b );
3827	bExp = extractFloatx80Exp( b );
3828	bSign = extractFloatx80Sign( b );
3829	zSign = aSign ^ bSign;
3830	if ( aExp == 0x7FFF ) {
3831	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3832	if ( bExp == 0x7FFF ) {
3833	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3834	goto invalid;
3835	}
3836	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3837	}
3838	if ( bExp == 0x7FFF ) {
3839	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3840	return packFloatx80( zSign, 0, 0 );
3841	}
3842	if ( bExp == 0 ) {
3843	if ( bSig == 0 ) {
3844	if ( ( aExp \| aSig ) == 0 ) {
3845	invalid:
3846	float_raise( float_flag_invalid STATUS_VAR);
3847	z.low = floatx80_default_nan_low;
3848	z.high = floatx80_default_nan_high;
3849	return z;
3850	}
3851	float_raise( float_flag_divbyzero STATUS_VAR);
3852	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3853	}
3854	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3855	}
3856	if ( aExp == 0 ) {
3857	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3858	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3859	}
3860	zExp = aExp - bExp + 0x3FFE;
3861	rem1 = 0;
3862	if ( bSig <= aSig ) {
3863	shift128Right( aSig, 0, 1, &aSig, &rem1 );
3864	++zExp;
3865	}
3866	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3867	mul64To128( bSig, zSig0, &term0, &term1 );
3868	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3869	while ( (sbits64) rem0 < 0 ) {
3870	--zSig0;
3871	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3872	}
3873	zSig1 = estimateDiv128To64( rem1, 0, bSig );
3874	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3875	mul64To128( bSig, zSig1, &term1, &term2 );
3876	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3877	while ( (sbits64) rem1 < 0 ) {
3878	--zSig1;
3879	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3880	}
3881	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
3882	}
3883	return
3884	roundAndPackFloatx80(
3885	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3886
3887	}
3888
3889	/*----------------------------------------------------------------------------
3890	\| Returns the remainder of the extended double-precision floating-point value
3891	\| `a' with respect to the corresponding value `b'. The operation is performed
3892	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893	----------------------------------------------------------------------------/
3894
3895	floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3896	{
3897	flag aSign, bSign, zSign;
3898	int32 aExp, bExp, expDiff;
3899	bits64 aSig0, aSig1, bSig;
3900	bits64 q, term0, term1, alternateASig0, alternateASig1;
3901	floatx80 z;
3902
3903	aSig0 = extractFloatx80Frac( a );
3904	aExp = extractFloatx80Exp( a );
3905	aSign = extractFloatx80Sign( a );
3906	bSig = extractFloatx80Frac( b );
3907	bExp = extractFloatx80Exp( b );
3908	bSign = extractFloatx80Sign( b );
3909	if ( aExp == 0x7FFF ) {
3910	if ( (bits64) ( aSig0<<1 )
3911	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3912	return propagateFloatx80NaN( a, b STATUS_VAR );
3913	}
3914	goto invalid;
3915	}
3916	if ( bExp == 0x7FFF ) {
3917	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3918	return a;
3919	}
3920	if ( bExp == 0 ) {
3921	if ( bSig == 0 ) {
3922	invalid:
3923	float_raise( float_flag_invalid STATUS_VAR);
3924	z.low = floatx80_default_nan_low;
3925	z.high = floatx80_default_nan_high;
3926	return z;
3927	}
3928	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3929	}
3930	if ( aExp == 0 ) {
3931	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3932	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3933	}
3934	bSig \|= LIT64( 0x8000000000000000 );
3935	zSign = aSign;
3936	expDiff = aExp - bExp;
3937	aSig1 = 0;
3938	if ( expDiff < 0 ) {
3939	if ( expDiff < -1 ) return a;
3940	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3941	expDiff = 0;
3942	}
3943	q = ( bSig <= aSig0 );
3944	if ( q ) aSig0 -= bSig;
3945	expDiff -= 64;
3946	while ( 0 < expDiff ) {
3947	q = estimateDiv128To64( aSig0, aSig1, bSig );
3948	q = ( 2 < q ) ? q - 2 : 0;
3949	mul64To128( bSig, q, &term0, &term1 );
3950	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3951	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3952	expDiff -= 62;
3953	}
3954	expDiff += 64;
3955	if ( 0 < expDiff ) {
3956	q = estimateDiv128To64( aSig0, aSig1, bSig );
3957	q = ( 2 < q ) ? q - 2 : 0;
3958	q >>= 64 - expDiff;
3959	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3960	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3962	while ( le128( term0, term1, aSig0, aSig1 ) ) {
3963	++q;
3964	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3965	}
3966	}
3967	else {
3968	term1 = 0;
3969	term0 = bSig;
3970	}
3971	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3972	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3973	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3974	&& ( q & 1 ) )
3975	) {
3976	aSig0 = alternateASig0;
3977	aSig1 = alternateASig1;
3978	zSign = ! zSign;
3979	}
3980	return
3981	normalizeRoundAndPackFloatx80(
3982	80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3983
3984	}
3985
3986	/*----------------------------------------------------------------------------
3987	\| Returns the square root of the extended double-precision floating-point
3988	\| value `a'. The operation is performed according to the IEC/IEEE Standard
3989	\| for Binary Floating-Point Arithmetic.
3990	----------------------------------------------------------------------------/
3991
3992	floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3993	{
3994	flag aSign;
3995	int32 aExp, zExp;
3996	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3997	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3998	floatx80 z;
3999
4000	aSig0 = extractFloatx80Frac( a );
4001	aExp = extractFloatx80Exp( a );
4002	aSign = extractFloatx80Sign( a );
4003	if ( aExp == 0x7FFF ) {
4004	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4005	if ( ! aSign ) return a;
4006	goto invalid;
4007	}
4008	if ( aSign ) {
4009	if ( ( aExp \| aSig0 ) == 0 ) return a;
4010	invalid:
4011	float_raise( float_flag_invalid STATUS_VAR);
4012	z.low = floatx80_default_nan_low;
4013	z.high = floatx80_default_nan_high;
4014	return z;
4015	}
4016	if ( aExp == 0 ) {
4017	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4018	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4019	}
4020	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4021	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4022	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4023	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4024	doubleZSig0 = zSig0<<1;
4025	mul64To128( zSig0, zSig0, &term0, &term1 );
4026	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4027	while ( (sbits64) rem0 < 0 ) {
4028	--zSig0;
4029	doubleZSig0 -= 2;
4030	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
4031	}
4032	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4033	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4034	if ( zSig1 == 0 ) zSig1 = 1;
4035	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4036	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4037	mul64To128( zSig1, zSig1, &term2, &term3 );
4038	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4039	while ( (sbits64) rem1 < 0 ) {
4040	--zSig1;
4041	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4042	term3 \|= 1;
4043	term2 \|= doubleZSig0;
4044	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4045	}
4046	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4047	}
4048	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4049	zSig0 \|= doubleZSig0;
4050	return
4051	roundAndPackFloatx80(
4052	STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4053
4054	}
4055
4056	/*----------------------------------------------------------------------------
4057	\| Returns 1 if the extended double-precision floating-point value `a' is
4058	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
4059	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
4060	\| Arithmetic.
4061	----------------------------------------------------------------------------/
4062
4063	int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4064	{
4065
4066	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4067	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4068	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4069	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4070	) {
4071	if ( floatx80_is_signaling_nan( a )
4072	\|\| floatx80_is_signaling_nan( b ) ) {
4073	float_raise( float_flag_invalid STATUS_VAR);
4074	}
4075	return 0;
4076	}
4077	return
4078	( a.low == b.low )
4079	&& ( ( a.high == b.high )
4080	\|\| ( ( a.low == 0 )
4081	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4082	);
4083
4084	}
4085
4086	/*----------------------------------------------------------------------------
4087	\| Returns 1 if the extended double-precision floating-point value `a' is
4088	\| less than or equal to the corresponding value `b', and 0 otherwise. The
4089	\| comparison is performed according to the IEC/IEEE Standard for Binary
4090	\| Floating-Point Arithmetic.
4091	----------------------------------------------------------------------------/
4092
4093	int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4094	{
4095	flag aSign, bSign;
4096
4097	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4098	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4099	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4100	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4101	) {
4102	float_raise( float_flag_invalid STATUS_VAR);
4103	return 0;
4104	}
4105	aSign = extractFloatx80Sign( a );
4106	bSign = extractFloatx80Sign( b );
4107	if ( aSign != bSign ) {
4108	return
4109	aSign
4110	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4111	== 0 );
4112	}
4113	return
4114	aSign ? le128( b.high, b.low, a.high, a.low )
4115	: le128( a.high, a.low, b.high, b.low );
4116
4117	}
4118
4119	/*----------------------------------------------------------------------------
4120	\| Returns 1 if the extended double-precision floating-point value `a' is
4121	\| less than the corresponding value `b', and 0 otherwise. The comparison
4122	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123	\| Arithmetic.
4124	----------------------------------------------------------------------------/
4125
4126	int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4127	{
4128	flag aSign, bSign;
4129
4130	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4131	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4133	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134	) {
4135	float_raise( float_flag_invalid STATUS_VAR);
4136	return 0;
4137	}
4138	aSign = extractFloatx80Sign( a );
4139	bSign = extractFloatx80Sign( b );
4140	if ( aSign != bSign ) {
4141	return
4142	aSign
4143	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4144	!= 0 );
4145	}
4146	return
4147	aSign ? lt128( b.high, b.low, a.high, a.low )
4148	: lt128( a.high, a.low, b.high, b.low );
4149
4150	}
4151
4152	/*----------------------------------------------------------------------------
4153	\| Returns 1 if the extended double-precision floating-point value `a' is equal
4154	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
4155	\| raised if either operand is a NaN. Otherwise, the comparison is performed
4156	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4157	----------------------------------------------------------------------------/
4158
4159	int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4160	{
4161
4162	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4163	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4164	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4165	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4166	) {
4167	float_raise( float_flag_invalid STATUS_VAR);
4168	return 0;
4169	}
4170	return
4171	( a.low == b.low )
4172	&& ( ( a.high == b.high )
4173	\|\| ( ( a.low == 0 )
4174	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4175	);
4176
4177	}
4178
4179	/*----------------------------------------------------------------------------
4180	\| Returns 1 if the extended double-precision floating-point value `a' is less
4181	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4182	\| do not cause an exception. Otherwise, the comparison is performed according
4183	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184	----------------------------------------------------------------------------/
4185
4186	int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4187	{
4188	flag aSign, bSign;
4189
4190	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4191	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4193	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4194	) {
4195	if ( floatx80_is_signaling_nan( a )
4196	\|\| floatx80_is_signaling_nan( b ) ) {
4197	float_raise( float_flag_invalid STATUS_VAR);
4198	}
4199	return 0;
4200	}
4201	aSign = extractFloatx80Sign( a );
4202	bSign = extractFloatx80Sign( b );
4203	if ( aSign != bSign ) {
4204	return
4205	aSign
4206	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4207	== 0 );
4208	}
4209	return
4210	aSign ? le128( b.high, b.low, a.high, a.low )
4211	: le128( a.high, a.low, b.high, b.low );
4212
4213	}
4214
4215	/*----------------------------------------------------------------------------
4216	\| Returns 1 if the extended double-precision floating-point value `a' is less
4217	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4218	\| an exception. Otherwise, the comparison is performed according to the
4219	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4220	----------------------------------------------------------------------------/
4221
4222	int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4223	{
4224	flag aSign, bSign;
4225
4226	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4227	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4228	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4229	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4230	) {
4231	if ( floatx80_is_signaling_nan( a )
4232	\|\| floatx80_is_signaling_nan( b ) ) {
4233	float_raise( float_flag_invalid STATUS_VAR);
4234	}
4235	return 0;
4236	}
4237	aSign = extractFloatx80Sign( a );
4238	bSign = extractFloatx80Sign( b );
4239	if ( aSign != bSign ) {
4240	return
4241	aSign
4242	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4243	!= 0 );
4244	}
4245	return
4246	aSign ? lt128( b.high, b.low, a.high, a.low )
4247	: lt128( a.high, a.low, b.high, b.low );
4248
4249	}
4250
4251	#endif
4252
4253	#ifdef FLOAT128
4254
4255	/*----------------------------------------------------------------------------
4256	\| Returns the result of converting the quadruple-precision floating-point
4257	\| value `a' to the 32-bit two's complement integer format. The conversion
4258	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259	\| Arithmetic---which means in particular that the conversion is rounded
4260	\| according to the current rounding mode. If `a' is a NaN, the largest
4261	\| positive integer is returned. Otherwise, if the conversion overflows, the
4262	\| largest integer with the same sign as `a' is returned.
4263	----------------------------------------------------------------------------/
4264
4265	int32 float128_to_int32( float128 a STATUS_PARAM )
4266	{
4267	flag aSign;
4268	int32 aExp, shiftCount;
4269	bits64 aSig0, aSig1;
4270
4271	aSig1 = extractFloat128Frac1( a );
4272	aSig0 = extractFloat128Frac0( a );
4273	aExp = extractFloat128Exp( a );
4274	aSign = extractFloat128Sign( a );
4275	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
4276	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4277	aSig0 \|= ( aSig1 != 0 );
4278	shiftCount = 0x4028 - aExp;
4279	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4280	return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4281
4282	}
4283
4284	/*----------------------------------------------------------------------------
4285	\| Returns the result of converting the quadruple-precision floating-point
4286	\| value `a' to the 32-bit two's complement integer format. The conversion
4287	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288	\| Arithmetic, except that the conversion is always rounded toward zero. If
4289	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4290	\| conversion overflows, the largest integer with the same sign as `a' is
4291	\| returned.
4292	----------------------------------------------------------------------------/
4293
4294	int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4295	{
4296	flag aSign;
4297	int32 aExp, shiftCount;
4298	bits64 aSig0, aSig1, savedASig;
4299	int32 z;
4300
4301	aSig1 = extractFloat128Frac1( a );
4302	aSig0 = extractFloat128Frac0( a );
4303	aExp = extractFloat128Exp( a );
4304	aSign = extractFloat128Sign( a );
4305	aSig0 \|= ( aSig1 != 0 );
4306	if ( 0x401E < aExp ) {
4307	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4308	goto invalid;
4309	}
4310	else if ( aExp < 0x3FFF ) {
4311	if ( aExp \|\| aSig0 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4312	return 0;
4313	}
4314	aSig0 \|= LIT64( 0x0001000000000000 );
4315	shiftCount = 0x402F - aExp;
4316	savedASig = aSig0;
4317	aSig0 >>= shiftCount;
4318	z = aSig0;
4319	if ( aSign ) z = - z;
4320	if ( ( z < 0 ) ^ aSign ) {
4321	invalid:
4322	float_raise( float_flag_invalid STATUS_VAR);
4323	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4324	}
4325	if ( ( aSig0<<shiftCount ) != savedASig ) {
4326	STATUS(float_exception_flags) \|= float_flag_inexact;
4327	}
4328	return z;
4329
4330	}
4331
4332	/*----------------------------------------------------------------------------
4333	\| Returns the result of converting the quadruple-precision floating-point
4334	\| value `a' to the 64-bit two's complement integer format. The conversion
4335	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4336	\| Arithmetic---which means in particular that the conversion is rounded
4337	\| according to the current rounding mode. If `a' is a NaN, the largest
4338	\| positive integer is returned. Otherwise, if the conversion overflows, the
4339	\| largest integer with the same sign as `a' is returned.
4340	----------------------------------------------------------------------------/
4341
4342	int64 float128_to_int64( float128 a STATUS_PARAM )
4343	{
4344	flag aSign;
4345	int32 aExp, shiftCount;
4346	bits64 aSig0, aSig1;
4347
4348	aSig1 = extractFloat128Frac1( a );
4349	aSig0 = extractFloat128Frac0( a );
4350	aExp = extractFloat128Exp( a );
4351	aSign = extractFloat128Sign( a );
4352	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4353	shiftCount = 0x402F - aExp;
4354	if ( shiftCount <= 0 ) {
4355	if ( 0x403E < aExp ) {
4356	float_raise( float_flag_invalid STATUS_VAR);
4357	if ( ! aSign
4358	\|\| ( ( aExp == 0x7FFF )
4359	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4360	)
4361	) {
4362	return LIT64( 0x7FFFFFFFFFFFFFFF );
4363	}
4364	return (sbits64) LIT64( 0x8000000000000000 );
4365	}
4366	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4367	}
4368	else {
4369	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4370	}
4371	return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4372
4373	}
4374
4375	/*----------------------------------------------------------------------------
4376	\| Returns the result of converting the quadruple-precision floating-point
4377	\| value `a' to the 64-bit two's complement integer format. The conversion
4378	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4379	\| Arithmetic, except that the conversion is always rounded toward zero.
4380	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4381	\| the conversion overflows, the largest integer with the same sign as `a' is
4382	\| returned.
4383	----------------------------------------------------------------------------/
4384
4385	int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4386	{
4387	flag aSign;
4388	int32 aExp, shiftCount;
4389	bits64 aSig0, aSig1;
4390	int64 z;
4391
4392	aSig1 = extractFloat128Frac1( a );
4393	aSig0 = extractFloat128Frac0( a );
4394	aExp = extractFloat128Exp( a );
4395	aSign = extractFloat128Sign( a );
4396	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4397	shiftCount = aExp - 0x402F;
4398	if ( 0 < shiftCount ) {
4399	if ( 0x403E <= aExp ) {
4400	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4401	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4402	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4403	if ( aSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4404	}
4405	else {
4406	float_raise( float_flag_invalid STATUS_VAR);
4407	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
4408	return LIT64( 0x7FFFFFFFFFFFFFFF );
4409	}
4410	}
4411	return (sbits64) LIT64( 0x8000000000000000 );
4412	}
4413	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
4414	if ( (bits64) ( aSig1<<shiftCount ) ) {
4415	STATUS(float_exception_flags) \|= float_flag_inexact;
4416	}
4417	}
4418	else {
4419	if ( aExp < 0x3FFF ) {
4420	if ( aExp \| aSig0 \| aSig1 ) {
4421	STATUS(float_exception_flags) \|= float_flag_inexact;
4422	}
4423	return 0;
4424	}
4425	z = aSig0>>( - shiftCount );
4426	if ( aSig1
4427	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4428	STATUS(float_exception_flags) \|= float_flag_inexact;
4429	}
4430	}
4431	if ( aSign ) z = - z;
4432	return z;
4433
4434	}
4435
4436	/*----------------------------------------------------------------------------
4437	\| Returns the result of converting the quadruple-precision floating-point
4438	\| value `a' to the single-precision floating-point format. The conversion
4439	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4440	\| Arithmetic.
4441	----------------------------------------------------------------------------/
4442
4443	float32 float128_to_float32( float128 a STATUS_PARAM )
4444	{
4445	flag aSign;
4446	int32 aExp;
4447	bits64 aSig0, aSig1;
4448	bits32 zSig;
4449
4450	aSig1 = extractFloat128Frac1( a );
4451	aSig0 = extractFloat128Frac0( a );
4452	aExp = extractFloat128Exp( a );
4453	aSign = extractFloat128Sign( a );
4454	if ( aExp == 0x7FFF ) {
4455	if ( aSig0 \| aSig1 ) {
4456	return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4457	}
4458	return packFloat32( aSign, 0xFF, 0 );
4459	}
4460	aSig0 \|= ( aSig1 != 0 );
4461	shift64RightJamming( aSig0, 18, &aSig0 );
4462	zSig = aSig0;
4463	if ( aExp \|\| zSig ) {
4464	zSig \|= 0x40000000;
4465	aExp -= 0x3F81;
4466	}
4467	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4468
4469	}
4470
4471	/*----------------------------------------------------------------------------
4472	\| Returns the result of converting the quadruple-precision floating-point
4473	\| value `a' to the double-precision floating-point format. The conversion
4474	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4475	\| Arithmetic.
4476	----------------------------------------------------------------------------/
4477
4478	float64 float128_to_float64( float128 a STATUS_PARAM )
4479	{
4480	flag aSign;
4481	int32 aExp;
4482	bits64 aSig0, aSig1;
4483
4484	aSig1 = extractFloat128Frac1( a );
4485	aSig0 = extractFloat128Frac0( a );
4486	aExp = extractFloat128Exp( a );
4487	aSign = extractFloat128Sign( a );
4488	if ( aExp == 0x7FFF ) {
4489	if ( aSig0 \| aSig1 ) {
4490	return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4491	}
4492	return packFloat64( aSign, 0x7FF, 0 );
4493	}
4494	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4495	aSig0 \|= ( aSig1 != 0 );
4496	if ( aExp \|\| aSig0 ) {
4497	aSig0 \|= LIT64( 0x4000000000000000 );
4498	aExp -= 0x3C01;
4499	}
4500	return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4501
4502	}
4503
4504	#ifdef FLOATX80
4505
4506	/*----------------------------------------------------------------------------
4507	\| Returns the result of converting the quadruple-precision floating-point
4508	\| value `a' to the extended double-precision floating-point format. The
4509	\| conversion is performed according to the IEC/IEEE Standard for Binary
4510	\| Floating-Point Arithmetic.
4511	----------------------------------------------------------------------------/
4512
4513	floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4514	{
4515	flag aSign;
4516	int32 aExp;
4517	bits64 aSig0, aSig1;
4518
4519	aSig1 = extractFloat128Frac1( a );
4520	aSig0 = extractFloat128Frac0( a );
4521	aExp = extractFloat128Exp( a );
4522	aSign = extractFloat128Sign( a );
4523	if ( aExp == 0x7FFF ) {
4524	if ( aSig0 \| aSig1 ) {
4525	return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4526	}
4527	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4528	}
4529	if ( aExp == 0 ) {
4530	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4531	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4532	}
4533	else {
4534	aSig0 \|= LIT64( 0x0001000000000000 );
4535	}
4536	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4537	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4538
4539	}
4540
4541	#endif
4542
4543	/*----------------------------------------------------------------------------
4544	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
4545	\| returns the result as a quadruple-precision floating-point value. The
4546	\| operation is performed according to the IEC/IEEE Standard for Binary
4547	\| Floating-Point Arithmetic.
4548	----------------------------------------------------------------------------/
4549
4550	float128 float128_round_to_int( float128 a STATUS_PARAM )
4551	{
4552	flag aSign;
4553	int32 aExp;
4554	bits64 lastBitMask, roundBitsMask;
4555	int8 roundingMode;
4556	float128 z;
4557
4558	aExp = extractFloat128Exp( a );
4559	if ( 0x402F <= aExp ) {
4560	if ( 0x406F <= aExp ) {
4561	if ( ( aExp == 0x7FFF )
4562	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
4563	) {
4564	return propagateFloat128NaN( a, a STATUS_VAR );
4565	}
4566	return a;
4567	}
4568	lastBitMask = 1;
4569	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4570	roundBitsMask = lastBitMask - 1;
4571	z = a;
4572	roundingMode = STATUS(float_rounding_mode);
4573	if ( roundingMode == float_round_nearest_even ) {
4574	if ( lastBitMask ) {
4575	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4576	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4577	}
4578	else {
4579	if ( (sbits64) z.low < 0 ) {
4580	++z.high;
4581	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4582	}
4583	}
4584	}
4585	else if ( roundingMode != float_round_to_zero ) {
4586	if ( extractFloat128Sign( z )
4587	^ ( roundingMode == float_round_up ) ) {
4588	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4589	}
4590	}
4591	z.low &= ~ roundBitsMask;
4592	}
4593	else {
4594	if ( aExp < 0x3FFF ) {
4595	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
4596	STATUS(float_exception_flags) \|= float_flag_inexact;
4597	aSign = extractFloat128Sign( a );
4598	switch ( STATUS(float_rounding_mode) ) {
4599	case float_round_nearest_even:
4600	if ( ( aExp == 0x3FFE )
4601	&& ( extractFloat128Frac0( a )
4602	\| extractFloat128Frac1( a ) )
4603	) {
4604	return packFloat128( aSign, 0x3FFF, 0, 0 );
4605	}
4606	break;
4607	case float_round_down:
4608	return
4609	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4610	: packFloat128( 0, 0, 0, 0 );
4611	case float_round_up:
4612	return
4613	aSign ? packFloat128( 1, 0, 0, 0 )
4614	: packFloat128( 0, 0x3FFF, 0, 0 );
4615	}
4616	return packFloat128( aSign, 0, 0, 0 );
4617	}
4618	lastBitMask = 1;
4619	lastBitMask <<= 0x402F - aExp;
4620	roundBitsMask = lastBitMask - 1;
4621	z.low = 0;
4622	z.high = a.high;
4623	roundingMode = STATUS(float_rounding_mode);
4624	if ( roundingMode == float_round_nearest_even ) {
4625	z.high += lastBitMask>>1;
4626	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
4627	z.high &= ~ lastBitMask;
4628	}
4629	}
4630	else if ( roundingMode != float_round_to_zero ) {
4631	if ( extractFloat128Sign( z )
4632	^ ( roundingMode == float_round_up ) ) {
4633	z.high \|= ( a.low != 0 );
4634	z.high += roundBitsMask;
4635	}
4636	}
4637	z.high &= ~ roundBitsMask;
4638	}
4639	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
4640	STATUS(float_exception_flags) \|= float_flag_inexact;
4641	}
4642	return z;
4643
4644	}
4645
4646	/*----------------------------------------------------------------------------
4647	\| Returns the result of adding the absolute values of the quadruple-precision
4648	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4649	\| before being returned. `zSign' is ignored if the result is a NaN.
4650	\| The addition is performed according to the IEC/IEEE Standard for Binary
4651	\| Floating-Point Arithmetic.
4652	----------------------------------------------------------------------------/
4653
4654	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4655	{
4656	int32 aExp, bExp, zExp;
4657	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4658	int32 expDiff;
4659
4660	aSig1 = extractFloat128Frac1( a );
4661	aSig0 = extractFloat128Frac0( a );
4662	aExp = extractFloat128Exp( a );
4663	bSig1 = extractFloat128Frac1( b );
4664	bSig0 = extractFloat128Frac0( b );
4665	bExp = extractFloat128Exp( b );
4666	expDiff = aExp - bExp;
4667	if ( 0 < expDiff ) {
4668	if ( aExp == 0x7FFF ) {
4669	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4670	return a;
4671	}
4672	if ( bExp == 0 ) {
4673	--expDiff;
4674	}
4675	else {
4676	bSig0 \|= LIT64( 0x0001000000000000 );
4677	}
4678	shift128ExtraRightJamming(
4679	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4680	zExp = aExp;
4681	}
4682	else if ( expDiff < 0 ) {
4683	if ( bExp == 0x7FFF ) {
4684	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4685	return packFloat128( zSign, 0x7FFF, 0, 0 );
4686	}
4687	if ( aExp == 0 ) {
4688	++expDiff;
4689	}
4690	else {
4691	aSig0 \|= LIT64( 0x0001000000000000 );
4692	}
4693	shift128ExtraRightJamming(
4694	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4695	zExp = bExp;
4696	}
4697	else {
4698	if ( aExp == 0x7FFF ) {
4699	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4700	return propagateFloat128NaN( a, b STATUS_VAR );
4701	}
4702	return a;
4703	}
4704	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4705	if ( aExp == 0 ) {
4706	if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4707	return packFloat128( zSign, 0, zSig0, zSig1 );
4708	}
4709	zSig2 = 0;
4710	zSig0 \|= LIT64( 0x0002000000000000 );
4711	zExp = aExp;
4712	goto shiftRight1;
4713	}
4714	aSig0 \|= LIT64( 0x0001000000000000 );
4715	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4716	--zExp;
4717	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4718	++zExp;
4719	shiftRight1:
4720	shift128ExtraRightJamming(
4721	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4722	roundAndPack:
4723	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4724
4725	}
4726
4727	/*----------------------------------------------------------------------------
4728	\| Returns the result of subtracting the absolute values of the quadruple-
4729	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
4730	\| difference is negated before being returned. `zSign' is ignored if the
4731	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
4732	\| Standard for Binary Floating-Point Arithmetic.
4733	----------------------------------------------------------------------------/
4734
4735	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4736	{
4737	int32 aExp, bExp, zExp;
4738	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4739	int32 expDiff;
4740	float128 z;
4741
4742	aSig1 = extractFloat128Frac1( a );
4743	aSig0 = extractFloat128Frac0( a );
4744	aExp = extractFloat128Exp( a );
4745	bSig1 = extractFloat128Frac1( b );
4746	bSig0 = extractFloat128Frac0( b );
4747	bExp = extractFloat128Exp( b );
4748	expDiff = aExp - bExp;
4749	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4750	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4751	if ( 0 < expDiff ) goto aExpBigger;
4752	if ( expDiff < 0 ) goto bExpBigger;
4753	if ( aExp == 0x7FFF ) {
4754	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4755	return propagateFloat128NaN( a, b STATUS_VAR );
4756	}
4757	float_raise( float_flag_invalid STATUS_VAR);
4758	z.low = float128_default_nan_low;
4759	z.high = float128_default_nan_high;
4760	return z;
4761	}
4762	if ( aExp == 0 ) {
4763	aExp = 1;
4764	bExp = 1;
4765	}
4766	if ( bSig0 < aSig0 ) goto aBigger;
4767	if ( aSig0 < bSig0 ) goto bBigger;
4768	if ( bSig1 < aSig1 ) goto aBigger;
4769	if ( aSig1 < bSig1 ) goto bBigger;
4770	return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4771	bExpBigger:
4772	if ( bExp == 0x7FFF ) {
4773	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4774	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4775	}
4776	if ( aExp == 0 ) {
4777	++expDiff;
4778	}
4779	else {
4780	aSig0 \|= LIT64( 0x4000000000000000 );
4781	}
4782	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4783	bSig0 \|= LIT64( 0x4000000000000000 );
4784	bBigger:
4785	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4786	zExp = bExp;
4787	zSign ^= 1;
4788	goto normalizeRoundAndPack;
4789	aExpBigger:
4790	if ( aExp == 0x7FFF ) {
4791	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4792	return a;
4793	}
4794	if ( bExp == 0 ) {
4795	--expDiff;
4796	}
4797	else {
4798	bSig0 \|= LIT64( 0x4000000000000000 );
4799	}
4800	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4801	aSig0 \|= LIT64( 0x4000000000000000 );
4802	aBigger:
4803	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4804	zExp = aExp;
4805	normalizeRoundAndPack:
4806	--zExp;
4807	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4808
4809	}
4810
4811	/*----------------------------------------------------------------------------
4812	\| Returns the result of adding the quadruple-precision floating-point values
4813	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4814	\| for Binary Floating-Point Arithmetic.
4815	----------------------------------------------------------------------------/
4816
4817	float128 float128_add( float128 a, float128 b STATUS_PARAM )
4818	{
4819	flag aSign, bSign;
4820
4821	aSign = extractFloat128Sign( a );
4822	bSign = extractFloat128Sign( b );
4823	if ( aSign == bSign ) {
4824	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4825	}
4826	else {
4827	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4828	}
4829
4830	}
4831
4832	/*----------------------------------------------------------------------------
4833	\| Returns the result of subtracting the quadruple-precision floating-point
4834	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4835	\| Standard for Binary Floating-Point Arithmetic.
4836	----------------------------------------------------------------------------/
4837
4838	float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4839	{
4840	flag aSign, bSign;
4841
4842	aSign = extractFloat128Sign( a );
4843	bSign = extractFloat128Sign( b );
4844	if ( aSign == bSign ) {
4845	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4846	}
4847	else {
4848	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4849	}
4850
4851	}
4852
4853	/*----------------------------------------------------------------------------
4854	\| Returns the result of multiplying the quadruple-precision floating-point
4855	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4856	\| Standard for Binary Floating-Point Arithmetic.
4857	----------------------------------------------------------------------------/
4858
4859	float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4860	{
4861	flag aSign, bSign, zSign;
4862	int32 aExp, bExp, zExp;
4863	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4864	float128 z;
4865
4866	aSig1 = extractFloat128Frac1( a );
4867	aSig0 = extractFloat128Frac0( a );
4868	aExp = extractFloat128Exp( a );
4869	aSign = extractFloat128Sign( a );
4870	bSig1 = extractFloat128Frac1( b );
4871	bSig0 = extractFloat128Frac0( b );
4872	bExp = extractFloat128Exp( b );
4873	bSign = extractFloat128Sign( b );
4874	zSign = aSign ^ bSign;
4875	if ( aExp == 0x7FFF ) {
4876	if ( ( aSig0 \| aSig1 )
4877	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4878	return propagateFloat128NaN( a, b STATUS_VAR );
4879	}
4880	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid;
4881	return packFloat128( zSign, 0x7FFF, 0, 0 );
4882	}
4883	if ( bExp == 0x7FFF ) {
4884	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4885	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4886	invalid:
4887	float_raise( float_flag_invalid STATUS_VAR);
4888	z.low = float128_default_nan_low;
4889	z.high = float128_default_nan_high;
4890	return z;
4891	}
4892	return packFloat128( zSign, 0x7FFF, 0, 0 );
4893	}
4894	if ( aExp == 0 ) {
4895	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4896	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4897	}
4898	if ( bExp == 0 ) {
4899	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4900	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4901	}
4902	zExp = aExp + bExp - 0x4000;
4903	aSig0 \|= LIT64( 0x0001000000000000 );
4904	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4905	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4906	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4907	zSig2 \|= ( zSig3 != 0 );
4908	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4909	shift128ExtraRightJamming(
4910	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4911	++zExp;
4912	}
4913	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4914
4915	}
4916
4917	/*----------------------------------------------------------------------------
4918	\| Returns the result of dividing the quadruple-precision floating-point value
4919	\| `a' by the corresponding value `b'. The operation is performed according to
4920	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4921	----------------------------------------------------------------------------/
4922
4923	float128 float128_div( float128 a, float128 b STATUS_PARAM )
4924	{
4925	flag aSign, bSign, zSign;
4926	int32 aExp, bExp, zExp;
4927	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4928	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4929	float128 z;
4930
4931	aSig1 = extractFloat128Frac1( a );
4932	aSig0 = extractFloat128Frac0( a );
4933	aExp = extractFloat128Exp( a );
4934	aSign = extractFloat128Sign( a );
4935	bSig1 = extractFloat128Frac1( b );
4936	bSig0 = extractFloat128Frac0( b );
4937	bExp = extractFloat128Exp( b );
4938	bSign = extractFloat128Sign( b );
4939	zSign = aSign ^ bSign;
4940	if ( aExp == 0x7FFF ) {
4941	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4942	if ( bExp == 0x7FFF ) {
4943	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4944	goto invalid;
4945	}
4946	return packFloat128( zSign, 0x7FFF, 0, 0 );
4947	}
4948	if ( bExp == 0x7FFF ) {
4949	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4950	return packFloat128( zSign, 0, 0, 0 );
4951	}
4952	if ( bExp == 0 ) {
4953	if ( ( bSig0 \| bSig1 ) == 0 ) {
4954	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4955	invalid:
4956	float_raise( float_flag_invalid STATUS_VAR);
4957	z.low = float128_default_nan_low;
4958	z.high = float128_default_nan_high;
4959	return z;
4960	}
4961	float_raise( float_flag_divbyzero STATUS_VAR);
4962	return packFloat128( zSign, 0x7FFF, 0, 0 );
4963	}
4964	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4965	}
4966	if ( aExp == 0 ) {
4967	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4968	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4969	}
4970	zExp = aExp - bExp + 0x3FFD;
4971	shortShift128Left(
4972	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4973	shortShift128Left(
4974	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4975	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4976	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4977	++zExp;
4978	}
4979	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4980	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4981	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4982	while ( (sbits64) rem0 < 0 ) {
4983	--zSig0;
4984	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4985	}
4986	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4987	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4988	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4989	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4990	while ( (sbits64) rem1 < 0 ) {
4991	--zSig1;
4992	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4993	}
4994	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4995	}
4996	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4997	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4998
4999	}
5000
5001	/*----------------------------------------------------------------------------
5002	\| Returns the remainder of the quadruple-precision floating-point value `a'
5003	\| with respect to the corresponding value `b'. The operation is performed
5004	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005	----------------------------------------------------------------------------/
5006
5007	float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5008	{
5009	flag aSign, bSign, zSign;
5010	int32 aExp, bExp, expDiff;
5011	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5012	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5013	sbits64 sigMean0;
5014	float128 z;
5015
5016	aSig1 = extractFloat128Frac1( a );
5017	aSig0 = extractFloat128Frac0( a );
5018	aExp = extractFloat128Exp( a );
5019	aSign = extractFloat128Sign( a );
5020	bSig1 = extractFloat128Frac1( b );
5021	bSig0 = extractFloat128Frac0( b );
5022	bExp = extractFloat128Exp( b );
5023	bSign = extractFloat128Sign( b );
5024	if ( aExp == 0x7FFF ) {
5025	if ( ( aSig0 \| aSig1 )
5026	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
5027	return propagateFloat128NaN( a, b STATUS_VAR );
5028	}
5029	goto invalid;
5030	}
5031	if ( bExp == 0x7FFF ) {
5032	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5033	return a;
5034	}
5035	if ( bExp == 0 ) {
5036	if ( ( bSig0 \| bSig1 ) == 0 ) {
5037	invalid:
5038	float_raise( float_flag_invalid STATUS_VAR);
5039	z.low = float128_default_nan_low;
5040	z.high = float128_default_nan_high;
5041	return z;
5042	}
5043	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5044	}
5045	if ( aExp == 0 ) {
5046	if ( ( aSig0 \| aSig1 ) == 0 ) return a;
5047	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5048	}
5049	expDiff = aExp - bExp;
5050	if ( expDiff < -1 ) return a;
5051	shortShift128Left(
5052	aSig0 \| LIT64( 0x0001000000000000 ),
5053	aSig1,
5054	15 - ( expDiff < 0 ),
5055	&aSig0,
5056	&aSig1
5057	);
5058	shortShift128Left(
5059	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5060	q = le128( bSig0, bSig1, aSig0, aSig1 );
5061	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5062	expDiff -= 64;
5063	while ( 0 < expDiff ) {
5064	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5065	q = ( 4 < q ) ? q - 4 : 0;
5066	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5067	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5068	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5069	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5070	expDiff -= 61;
5071	}
5072	if ( -64 < expDiff ) {
5073	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5074	q = ( 4 < q ) ? q - 4 : 0;
5075	q >>= - expDiff;
5076	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5077	expDiff += 52;
5078	if ( expDiff < 0 ) {
5079	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5080	}
5081	else {
5082	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5083	}
5084	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5085	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5086	}
5087	else {
5088	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5089	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5090	}
5091	do {
5092	alternateASig0 = aSig0;
5093	alternateASig1 = aSig1;
5094	++q;
5095	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5096	} while ( 0 <= (sbits64) aSig0 );
5097	add128(
5098	aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5099	if ( ( sigMean0 < 0 )
5100	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5101	aSig0 = alternateASig0;
5102	aSig1 = alternateASig1;
5103	}
5104	zSign = ( (sbits64) aSig0 < 0 );
5105	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5106	return
5107	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5108
5109	}
5110
5111	/*----------------------------------------------------------------------------
5112	\| Returns the square root of the quadruple-precision floating-point value `a'.
5113	\| The operation is performed according to the IEC/IEEE Standard for Binary
5114	\| Floating-Point Arithmetic.
5115	----------------------------------------------------------------------------/
5116
5117	float128 float128_sqrt( float128 a STATUS_PARAM )
5118	{
5119	flag aSign;
5120	int32 aExp, zExp;
5121	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5122	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5123	float128 z;
5124
5125	aSig1 = extractFloat128Frac1( a );
5126	aSig0 = extractFloat128Frac0( a );
5127	aExp = extractFloat128Exp( a );
5128	aSign = extractFloat128Sign( a );
5129	if ( aExp == 0x7FFF ) {
5130	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5131	if ( ! aSign ) return a;
5132	goto invalid;
5133	}
5134	if ( aSign ) {
5135	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a;
5136	invalid:
5137	float_raise( float_flag_invalid STATUS_VAR);
5138	z.low = float128_default_nan_low;
5139	z.high = float128_default_nan_high;
5140	return z;
5141	}
5142	if ( aExp == 0 ) {
5143	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5144	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5145	}
5146	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5147	aSig0 \|= LIT64( 0x0001000000000000 );
5148	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5149	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5150	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5151	doubleZSig0 = zSig0<<1;
5152	mul64To128( zSig0, zSig0, &term0, &term1 );
5153	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5154	while ( (sbits64) rem0 < 0 ) {
5155	--zSig0;
5156	doubleZSig0 -= 2;
5157	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
5158	}
5159	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5160	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5161	if ( zSig1 == 0 ) zSig1 = 1;
5162	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5163	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5164	mul64To128( zSig1, zSig1, &term2, &term3 );
5165	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5166	while ( (sbits64) rem1 < 0 ) {
5167	--zSig1;
5168	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5169	term3 \|= 1;
5170	term2 \|= doubleZSig0;
5171	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5172	}
5173	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
5174	}
5175	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5176	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5177
5178	}
5179
5180	/*----------------------------------------------------------------------------
5181	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5182	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5183	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184	----------------------------------------------------------------------------/
5185
5186	int float128_eq( float128 a, float128 b STATUS_PARAM )
5187	{
5188
5189	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5190	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5191	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5192	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5193	) {
5194	if ( float128_is_signaling_nan( a )
5195	\|\| float128_is_signaling_nan( b ) ) {
5196	float_raise( float_flag_invalid STATUS_VAR);
5197	}
5198	return 0;
5199	}
5200	return
5201	( a.low == b.low )
5202	&& ( ( a.high == b.high )
5203	\|\| ( ( a.low == 0 )
5204	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5205	);
5206
5207	}
5208
5209	/*----------------------------------------------------------------------------
5210	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5211	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
5212	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5213	\| Arithmetic.
5214	----------------------------------------------------------------------------/
5215
5216	int float128_le( float128 a, float128 b STATUS_PARAM )
5217	{
5218	flag aSign, bSign;
5219
5220	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5221	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5222	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5223	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5224	) {
5225	float_raise( float_flag_invalid STATUS_VAR);
5226	return 0;
5227	}
5228	aSign = extractFloat128Sign( a );
5229	bSign = extractFloat128Sign( b );
5230	if ( aSign != bSign ) {
5231	return
5232	aSign
5233	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5234	== 0 );
5235	}
5236	return
5237	aSign ? le128( b.high, b.low, a.high, a.low )
5238	: le128( a.high, a.low, b.high, b.low );
5239
5240	}
5241
5242	/*----------------------------------------------------------------------------
5243	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5244	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5245	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5246	----------------------------------------------------------------------------/
5247
5248	int float128_lt( float128 a, float128 b STATUS_PARAM )
5249	{
5250	flag aSign, bSign;
5251
5252	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5253	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5254	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5255	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5256	) {
5257	float_raise( float_flag_invalid STATUS_VAR);
5258	return 0;
5259	}
5260	aSign = extractFloat128Sign( a );
5261	bSign = extractFloat128Sign( b );
5262	if ( aSign != bSign ) {
5263	return
5264	aSign
5265	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5266	!= 0 );
5267	}
5268	return
5269	aSign ? lt128( b.high, b.low, a.high, a.low )
5270	: lt128( a.high, a.low, b.high, b.low );
5271
5272	}
5273
5274	/*----------------------------------------------------------------------------
5275	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5276	\| the corresponding value `b', and 0 otherwise. The invalid exception is
5277	\| raised if either operand is a NaN. Otherwise, the comparison is performed
5278	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5279	----------------------------------------------------------------------------/
5280
5281	int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5282	{
5283
5284	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5285	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5286	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5287	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5288	) {
5289	float_raise( float_flag_invalid STATUS_VAR);
5290	return 0;
5291	}
5292	return
5293	( a.low == b.low )
5294	&& ( ( a.high == b.high )
5295	\|\| ( ( a.low == 0 )
5296	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5297	);
5298
5299	}
5300
5301	/*----------------------------------------------------------------------------
5302	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5303	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5304	\| cause an exception. Otherwise, the comparison is performed according to the
5305	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5306	----------------------------------------------------------------------------/
5307
5308	int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5309	{
5310	flag aSign, bSign;
5311
5312	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5313	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5314	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5315	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5316	) {
5317	if ( float128_is_signaling_nan( a )
5318	\|\| float128_is_signaling_nan( b ) ) {
5319	float_raise( float_flag_invalid STATUS_VAR);
5320	}
5321	return 0;
5322	}
5323	aSign = extractFloat128Sign( a );
5324	bSign = extractFloat128Sign( b );
5325	if ( aSign != bSign ) {
5326	return
5327	aSign
5328	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5329	== 0 );
5330	}
5331	return
5332	aSign ? le128( b.high, b.low, a.high, a.low )
5333	: le128( a.high, a.low, b.high, b.low );
5334
5335	}
5336
5337	/*----------------------------------------------------------------------------
5338	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5339	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5340	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5341	\| Standard for Binary Floating-Point Arithmetic.
5342	----------------------------------------------------------------------------/
5343
5344	int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5345	{
5346	flag aSign, bSign;
5347
5348	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5349	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5350	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5351	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5352	) {
5353	if ( float128_is_signaling_nan( a )
5354	\|\| float128_is_signaling_nan( b ) ) {
5355	float_raise( float_flag_invalid STATUS_VAR);
5356	}
5357	return 0;
5358	}
5359	aSign = extractFloat128Sign( a );
5360	bSign = extractFloat128Sign( b );
5361	if ( aSign != bSign ) {
5362	return
5363	aSign
5364	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5365	!= 0 );
5366	}
5367	return
5368	aSign ? lt128( b.high, b.low, a.high, a.low )
5369	: lt128( a.high, a.low, b.high, b.low );
5370
5371	}
5372
5373	#endif
5374
5375	/* misc functions */
5376	float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5377	{
5378	return int64_to_float32(a STATUS_VAR);
5379	}
5380
5381	float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5382	{
5383	return int64_to_float64(a STATUS_VAR);
5384	}
5385
5386	unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5387	{
5388	int64_t v;
5389	unsigned int res;
5390
5391	v = float32_to_int64(a STATUS_VAR);
5392	if (v < 0) {
5393	res = 0;
5394	float_raise( float_flag_invalid STATUS_VAR);
5395	} else if (v > 0xffffffff) {
5396	res = 0xffffffff;
5397	float_raise( float_flag_invalid STATUS_VAR);
5398	} else {
5399	res = v;
5400	}
5401	return res;
5402	}
5403
5404	unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5405	{
5406	int64_t v;
5407	unsigned int res;
5408
5409	v = float32_to_int64_round_to_zero(a STATUS_VAR);
5410	if (v < 0) {
5411	res = 0;
5412	float_raise( float_flag_invalid STATUS_VAR);
5413	} else if (v > 0xffffffff) {
5414	res = 0xffffffff;
5415	float_raise( float_flag_invalid STATUS_VAR);
5416	} else {
5417	res = v;
5418	}
5419	return res;
5420	}
5421
5422	unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5423	{
5424	int64_t v;
5425	unsigned int res;
5426
5427	v = float64_to_int64(a STATUS_VAR);
5428	if (v < 0) {
5429	res = 0;
5430	float_raise( float_flag_invalid STATUS_VAR);
5431	} else if (v > 0xffffffff) {
5432	res = 0xffffffff;
5433	float_raise( float_flag_invalid STATUS_VAR);
5434	} else {
5435	res = v;
5436	}
5437	return res;
5438	}
5439
5440	unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5441	{
5442	int64_t v;
5443	unsigned int res;
5444
5445	v = float64_to_int64_round_to_zero(a STATUS_VAR);
5446	if (v < 0) {
5447	res = 0;
5448	float_raise( float_flag_invalid STATUS_VAR);
5449	} else if (v > 0xffffffff) {
5450	res = 0xffffffff;
5451	float_raise( float_flag_invalid STATUS_VAR);
5452	} else {
5453	res = v;
5454	}
5455	return res;
5456	}
5457
5458	/* FIXME: This looks broken. */
5459	uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5460	{
5461	int64_t v;
5462
5463	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5464	v += float64_val(a);
5465	v = float64_to_int64(make_float64(v) STATUS_VAR);
5466
5467	return v - INT64_MIN;
5468	}
5469
5470	uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5471	{
5472	int64_t v;
5473
5474	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5475	v += float64_val(a);
5476	v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5477
5478	return v - INT64_MIN;
5479	}
5480
5481	#define COMPARE(s, nan_exp) \
5482	INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5483	int is_quiet STATUS_PARAM ) \
5484	{ \
5485	flag aSign, bSign; \
5486	bits ## s av, bv; \
5487	\
5488	if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5489	extractFloat ## s ## Frac( a ) ) \|\| \
5490	( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5491	extractFloat ## s ## Frac( b ) )) { \
5492	if (!is_quiet \|\| \
5493	float ## s ## _is_signaling_nan( a ) \|\| \
5494	float ## s ## _is_signaling_nan( b ) ) { \
5495	float_raise( float_flag_invalid STATUS_VAR); \
5496	} \
5497	return float_relation_unordered; \
5498	} \
5499	aSign = extractFloat ## s ## Sign( a ); \
5500	bSign = extractFloat ## s ## Sign( b ); \
5501	av = float ## s ## _val(a); \
5502	bv = float ## s ## _val(b); \
5503	if ( aSign != bSign ) { \
5504	if ( (bits ## s) ( ( av \| bv )<<1 ) == 0 ) { \
5505	/* zero case */ \
5506	return float_relation_equal; \
5507	} else { \
5508	return 1 - (2 * aSign); \
5509	} \
5510	} else { \
5511	if (av == bv) { \
5512	return float_relation_equal; \
5513	} else { \
5514	return 1 - 2 * (aSign ^ ( av < bv )); \
5515	} \
5516	} \
5517	} \
5518	\
5519	int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5520	{ \
5521	return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5522	} \
5523	\
5524	int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5525	{ \
5526	return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5527	}
5528
5529	COMPARE(32, 0xff)
5530	COMPARE(64, 0x7ff)
5531
5532	INLINE int float128_compare_internal( float128 a, float128 b,
5533	int is_quiet STATUS_PARAM )
5534	{
5535	flag aSign, bSign;
5536
5537	if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5538	( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) ) \|\|
5539	( ( extractFloat128Exp( b ) == 0x7fff ) &&
5540	( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )) {
5541	if (!is_quiet \|\|
5542	float128_is_signaling_nan( a ) \|\|
5543	float128_is_signaling_nan( b ) ) {
5544	float_raise( float_flag_invalid STATUS_VAR);
5545	}
5546	return float_relation_unordered;
5547	}
5548	aSign = extractFloat128Sign( a );
5549	bSign = extractFloat128Sign( b );
5550	if ( aSign != bSign ) {
5551	if ( ( ( ( a.high \| b.high )<<1 ) \| a.low \| b.low ) == 0 ) {
5552	/* zero case */
5553	return float_relation_equal;
5554	} else {
5555	return 1 - (2 * aSign);
5556	}
5557	} else {
5558	if (a.low == b.low && a.high == b.high) {
5559	return float_relation_equal;
5560	} else {
5561	return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5562	}
5563	}
5564	}
5565
5566	int float128_compare( float128 a, float128 b STATUS_PARAM )
5567	{
5568	return float128_compare_internal(a, b, 0 STATUS_VAR);
5569	}
5570
5571	int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5572	{
5573	return float128_compare_internal(a, b, 1 STATUS_VAR);
5574	}
5575
5576	/* Multiply A by 2 raised to the power N. */
5577	float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5578	{
5579	flag aSign;
5580	int16 aExp;
5581	bits32 aSig;
5582
5583	aSig = extractFloat32Frac( a );
5584	aExp = extractFloat32Exp( a );
5585	aSign = extractFloat32Sign( a );
5586
5587	if ( aExp == 0xFF ) {
5588	return a;
5589	}
5590	if ( aExp != 0 )
5591	aSig \|= 0x00800000;
5592	else if ( aSig == 0 )
5593	return a;
5594
5595	aExp += n - 1;
5596	aSig <<= 7;
5597	return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5598	}
5599
5600	float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5601	{
5602	flag aSign;
5603	int16 aExp;
5604	bits64 aSig;
5605
5606	aSig = extractFloat64Frac( a );
5607	aExp = extractFloat64Exp( a );
5608	aSign = extractFloat64Sign( a );
5609
5610	if ( aExp == 0x7FF ) {
5611	return a;
5612	}
5613	if ( aExp != 0 )
5614	aSig \|= LIT64( 0x0010000000000000 );
5615	else if ( aSig == 0 )
5616	return a;
5617
5618	aExp += n - 1;
5619	aSig <<= 10;
5620	return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5621	}
5622
5623	#ifdef FLOATX80
5624	floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5625	{
5626	flag aSign;
5627	int16 aExp;
5628	bits64 aSig;
5629
5630	aSig = extractFloatx80Frac( a );
5631	aExp = extractFloatx80Exp( a );
5632	aSign = extractFloatx80Sign( a );
5633
5634	if ( aExp == 0x7FF ) {
5635	return a;
5636	}
5637	if (aExp == 0 && aSig == 0)
5638	return a;
5639
5640	aExp += n;
5641	return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5642	aSign, aExp, aSig, 0 STATUS_VAR );
5643	}
5644	#endif
5645
5646	#ifdef FLOAT128
5647	float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5648	{
5649	flag aSign;
5650	int32 aExp;
5651	bits64 aSig0, aSig1;
5652
5653	aSig1 = extractFloat128Frac1( a );
5654	aSig0 = extractFloat128Frac0( a );
5655	aExp = extractFloat128Exp( a );
5656	aSign = extractFloat128Sign( a );
5657	if ( aExp == 0x7FFF ) {
5658	return a;
5659	}
5660	if ( aExp != 0 )
5661	aSig0 \|= LIT64( 0x0001000000000000 );
5662	else if ( aSig0 == 0 && aSig1 == 0 )
5663	return a;
5664
5665	aExp += n - 1;
5666	return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5667	STATUS_VAR );
5668
5669	}
5670	#endif

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/recompiler/fpu/softfloat.c@ 22173

以其他格式下載: