softfloat.c@ 36141

最後變更在這個檔案從36141是 36140,由 vboxsync 提交於 14 年前
rem: Re-synced to svn://svn.savannah.nongnu.org/qemu/trunk@5495 (repo UUID c046a42c-6fe2-441c-8c8c-71466251a162).
屬性 svn:eol-style 設為 `native`
檔案大小: 194.2 KB

行
1
2	/*============================================================================
3
4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5	Package, Release 2b.
6
7	Written by John R. Hauser. This work was made possible in part by the
8	International Computer Science Institute, located at Suite 600, 1947 Center
9	Street, Berkeley, California 94704. Funding was partially provided by the
10	National Science Foundation under grant MIP-9311980. The original version
11	of this code was written as part of a project to build a fixed-point vector
12	processor in collaboration with the University of California at Berkeley,
13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15	arithmetic/SoftFloat.html'.
16
17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26	Derivative works are acceptable, even for commercial purposes, so long as
27	(1) the source code for the derivative work includes prominent notice that
28	the work is derivative, and (2) the source code includes prominent notice with
29	these four paragraphs for those parts of this code that are retained.
30
31	=============================================================================*/
32
33	#include "softfloat.h"
34
35	/*----------------------------------------------------------------------------
36	\| Primitive arithmetic functions, including multi-word arithmetic, and
37	\| division and square root approximations. (Can be specialized to target if
38	\| desired.)
39	----------------------------------------------------------------------------/
40	#include "softfloat-macros.h"
41
42	/*----------------------------------------------------------------------------
43	\| Functions and definitions to determine: (1) whether tininess for underflow
44	\| is detected before or after rounding by default, (2) what (if anything)
45	\| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46	\| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47	\| are propagated from function inputs to output. These details are target-
48	\| specific.
49	----------------------------------------------------------------------------/
50	#include "softfloat-specialize.h"
51
52	void set_float_rounding_mode(int val STATUS_PARAM)
53	{
54	STATUS(float_rounding_mode) = val;
55	}
56
57	void set_float_exception_flags(int val STATUS_PARAM)
58	{
59	STATUS(float_exception_flags) = val;
60	}
61
62	#ifdef FLOATX80
63	void set_floatx80_rounding_precision(int val STATUS_PARAM)
64	{
65	STATUS(floatx80_rounding_precision) = val;
66	}
67	#endif
68
69	/*----------------------------------------------------------------------------
70	\| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71	\| and 7, and returns the properly rounded 32-bit integer corresponding to the
72	\| input. If `zSign' is 1, the input is negated before being converted to an
73	\| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74	\| is simply rounded to an integer, with the inexact exception raised if the
75	\| input cannot be represented exactly as an integer. However, if the fixed-
76	\| point input is too large, the invalid exception is raised and the largest
77	\| positive or negative integer is returned.
78	----------------------------------------------------------------------------/
79
80	static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81	{
82	int8 roundingMode;
83	flag roundNearestEven;
84	int8 roundIncrement, roundBits;
85	int32 z;
86
87	roundingMode = STATUS(float_rounding_mode);
88	roundNearestEven = ( roundingMode == float_round_nearest_even );
89	roundIncrement = 0x40;
90	if ( ! roundNearestEven ) {
91	if ( roundingMode == float_round_to_zero ) {
92	roundIncrement = 0;
93	}
94	else {
95	roundIncrement = 0x7F;
96	if ( zSign ) {
97	if ( roundingMode == float_round_up ) roundIncrement = 0;
98	}
99	else {
100	if ( roundingMode == float_round_down ) roundIncrement = 0;
101	}
102	}
103	}
104	roundBits = absZ & 0x7F;
105	absZ = ( absZ + roundIncrement )>>7;
106	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107	z = absZ;
108	if ( zSign ) z = - z;
109	if ( ( absZ>>32 ) \|\| ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110	float_raise( float_flag_invalid STATUS_VAR);
111	return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112	}
113	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
114	return z;
115
116	}
117
118	/*----------------------------------------------------------------------------
119	\| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120	\| `absZ1', with binary point between bits 63 and 64 (between the input words),
121	\| and returns the properly rounded 64-bit integer corresponding to the input.
122	\| If `zSign' is 1, the input is negated before being converted to an integer.
123	\| Ordinarily, the fixed-point input is simply rounded to an integer, with
124	\| the inexact exception raised if the input cannot be represented exactly as
125	\| an integer. However, if the fixed-point input is too large, the invalid
126	\| exception is raised and the largest positive or negative integer is
127	\| returned.
128	----------------------------------------------------------------------------/
129
130	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131	{
132	int8 roundingMode;
133	flag roundNearestEven, increment;
134	int64 z;
135
136	roundingMode = STATUS(float_rounding_mode);
137	roundNearestEven = ( roundingMode == float_round_nearest_even );
138	increment = ( (sbits64) absZ1 < 0 );
139	if ( ! roundNearestEven ) {
140	if ( roundingMode == float_round_to_zero ) {
141	increment = 0;
142	}
143	else {
144	if ( zSign ) {
145	increment = ( roundingMode == float_round_down ) && absZ1;
146	}
147	else {
148	increment = ( roundingMode == float_round_up ) && absZ1;
149	}
150	}
151	}
152	if ( increment ) {
153	++absZ0;
154	if ( absZ0 == 0 ) goto overflow;
155	absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156	}
157	z = absZ0;
158	if ( zSign ) z = - z;
159	if ( z && ( ( z < 0 ) ^ zSign ) ) {
160	overflow:
161	float_raise( float_flag_invalid STATUS_VAR);
162	return
163	zSign ? (sbits64) LIT64( 0x8000000000000000 )
164	: LIT64( 0x7FFFFFFFFFFFFFFF );
165	}
166	if ( absZ1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
167	return z;
168
169	}
170
171	/*----------------------------------------------------------------------------
172	\| Returns the fraction bits of the single-precision floating-point value `a'.
173	----------------------------------------------------------------------------/
174
175	INLINE bits32 extractFloat32Frac( float32 a )
176	{
177
178	return float32_val(a) & 0x007FFFFF;
179
180	}
181
182	/*----------------------------------------------------------------------------
183	\| Returns the exponent bits of the single-precision floating-point value `a'.
184	----------------------------------------------------------------------------/
185
186	INLINE int16 extractFloat32Exp( float32 a )
187	{
188
189	return ( float32_val(a)>>23 ) & 0xFF;
190
191	}
192
193	/*----------------------------------------------------------------------------
194	\| Returns the sign bit of the single-precision floating-point value `a'.
195	----------------------------------------------------------------------------/
196
197	INLINE flag extractFloat32Sign( float32 a )
198	{
199
200	return float32_val(a)>>31;
201
202	}
203
204	/*----------------------------------------------------------------------------
205	\| Normalizes the subnormal single-precision floating-point value represented
206	\| by the denormalized significand `aSig'. The normalized exponent and
207	\| significand are stored at the locations pointed to by `zExpPtr' and
208	\| `zSigPtr', respectively.
209	----------------------------------------------------------------------------/
210
211	static void
212	normalizeFloat32Subnormal( bits32 aSig, int16 zExpPtr, bits32 zSigPtr )
213	{
214	int8 shiftCount;
215
216	shiftCount = countLeadingZeros32( aSig ) - 8;
217	*zSigPtr = aSig<<shiftCount;
218	*zExpPtr = 1 - shiftCount;
219
220	}
221
222	/*----------------------------------------------------------------------------
223	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224	\| single-precision floating-point value, returning the result. After being
225	\| shifted into the proper positions, the three fields are simply added
226	\| together to form the result. This means that any integer portion of `zSig'
227	\| will be added into the exponent. Since a properly normalized significand
228	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
229	\| than the desired result exponent whenever `zSig' is a complete, normalized
230	\| significand.
231	----------------------------------------------------------------------------/
232
233	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234	{
235
236	return make_float32(
237	( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
238
239	}
240
241	/*----------------------------------------------------------------------------
242	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
243	\| and significand `zSig', and returns the proper single-precision floating-
244	\| point value corresponding to the abstract input. Ordinarily, the abstract
245	\| value is simply rounded and packed into the single-precision format, with
246	\| the inexact exception raised if the abstract input cannot be represented
247	\| exactly. However, if the abstract value is too large, the overflow and
248	\| inexact exceptions are raised and an infinity or maximal finite value is
249	\| returned. If the abstract value is too small, the input value is rounded to
250	\| a subnormal number, and the underflow and inexact exceptions are raised if
251	\| the abstract input cannot be represented exactly as a subnormal single-
252	\| precision floating-point number.
253	\| The input significand `zSig' has its binary point between bits 30
254	\| and 29, which is 7 bits to the left of the usual location. This shifted
255	\| significand must be normalized or smaller. If `zSig' is not normalized,
256	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
257	\| and it must not require rounding. In the usual case that `zSig' is
258	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
259	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
260	\| Binary Floating-Point Arithmetic.
261	----------------------------------------------------------------------------/
262
263	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
264	{
265	int8 roundingMode;
266	flag roundNearestEven;
267	int8 roundIncrement, roundBits;
268	flag isTiny;
269
270	roundingMode = STATUS(float_rounding_mode);
271	roundNearestEven = ( roundingMode == float_round_nearest_even );
272	roundIncrement = 0x40;
273	if ( ! roundNearestEven ) {
274	if ( roundingMode == float_round_to_zero ) {
275	roundIncrement = 0;
276	}
277	else {
278	roundIncrement = 0x7F;
279	if ( zSign ) {
280	if ( roundingMode == float_round_up ) roundIncrement = 0;
281	}
282	else {
283	if ( roundingMode == float_round_down ) roundIncrement = 0;
284	}
285	}
286	}
287	roundBits = zSig & 0x7F;
288	if ( 0xFD <= (bits16) zExp ) {
289	if ( ( 0xFD < zExp )
290	\|\| ( ( zExp == 0xFD )
291	&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
292	) {
293	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
294	return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
295	}
296	if ( zExp < 0 ) {
297	isTiny =
298	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
299	\|\| ( zExp < -1 )
300	\|\| ( zSig + roundIncrement < 0x80000000 );
301	shift32RightJamming( zSig, - zExp, &zSig );
302	zExp = 0;
303	roundBits = zSig & 0x7F;
304	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
305	}
306	}
307	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
308	zSig = ( zSig + roundIncrement )>>7;
309	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
310	if ( zSig == 0 ) zExp = 0;
311	return packFloat32( zSign, zExp, zSig );
312
313	}
314
315	/*----------------------------------------------------------------------------
316	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
317	\| and significand `zSig', and returns the proper single-precision floating-
318	\| point value corresponding to the abstract input. This routine is just like
319	\| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
320	\| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
321	\| floating-point exponent.
322	----------------------------------------------------------------------------/
323
324	static float32
325	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
326	{
327	int8 shiftCount;
328
329	shiftCount = countLeadingZeros32( zSig ) - 1;
330	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
331
332	}
333
334	/*----------------------------------------------------------------------------
335	\| Returns the fraction bits of the double-precision floating-point value `a'.
336	----------------------------------------------------------------------------/
337
338	INLINE bits64 extractFloat64Frac( float64 a )
339	{
340
341	return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
342
343	}
344
345	/*----------------------------------------------------------------------------
346	\| Returns the exponent bits of the double-precision floating-point value `a'.
347	----------------------------------------------------------------------------/
348
349	INLINE int16 extractFloat64Exp( float64 a )
350	{
351
352	return ( float64_val(a)>>52 ) & 0x7FF;
353
354	}
355
356	/*----------------------------------------------------------------------------
357	\| Returns the sign bit of the double-precision floating-point value `a'.
358	----------------------------------------------------------------------------/
359
360	INLINE flag extractFloat64Sign( float64 a )
361	{
362
363	return float64_val(a)>>63;
364
365	}
366
367	/*----------------------------------------------------------------------------
368	\| Normalizes the subnormal double-precision floating-point value represented
369	\| by the denormalized significand `aSig'. The normalized exponent and
370	\| significand are stored at the locations pointed to by `zExpPtr' and
371	\| `zSigPtr', respectively.
372	----------------------------------------------------------------------------/
373
374	static void
375	normalizeFloat64Subnormal( bits64 aSig, int16 zExpPtr, bits64 zSigPtr )
376	{
377	int8 shiftCount;
378
379	shiftCount = countLeadingZeros64( aSig ) - 11;
380	*zSigPtr = aSig<<shiftCount;
381	*zExpPtr = 1 - shiftCount;
382
383	}
384
385	/*----------------------------------------------------------------------------
386	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
387	\| double-precision floating-point value, returning the result. After being
388	\| shifted into the proper positions, the three fields are simply added
389	\| together to form the result. This means that any integer portion of `zSig'
390	\| will be added into the exponent. Since a properly normalized significand
391	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
392	\| than the desired result exponent whenever `zSig' is a complete, normalized
393	\| significand.
394	----------------------------------------------------------------------------/
395
396	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
397	{
398
399	return make_float64(
400	( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
401
402	}
403
404	/*----------------------------------------------------------------------------
405	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406	\| and significand `zSig', and returns the proper double-precision floating-
407	\| point value corresponding to the abstract input. Ordinarily, the abstract
408	\| value is simply rounded and packed into the double-precision format, with
409	\| the inexact exception raised if the abstract input cannot be represented
410	\| exactly. However, if the abstract value is too large, the overflow and
411	\| inexact exceptions are raised and an infinity or maximal finite value is
412	\| returned. If the abstract value is too small, the input value is rounded
413	\| to a subnormal number, and the underflow and inexact exceptions are raised
414	\| if the abstract input cannot be represented exactly as a subnormal double-
415	\| precision floating-point number.
416	\| The input significand `zSig' has its binary point between bits 62
417	\| and 61, which is 10 bits to the left of the usual location. This shifted
418	\| significand must be normalized or smaller. If `zSig' is not normalized,
419	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
420	\| and it must not require rounding. In the usual case that `zSig' is
421	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
422	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
423	\| Binary Floating-Point Arithmetic.
424	----------------------------------------------------------------------------/
425
426	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
427	{
428	int8 roundingMode;
429	flag roundNearestEven;
430	int16 roundIncrement, roundBits;
431	flag isTiny;
432
433	roundingMode = STATUS(float_rounding_mode);
434	roundNearestEven = ( roundingMode == float_round_nearest_even );
435	roundIncrement = 0x200;
436	if ( ! roundNearestEven ) {
437	if ( roundingMode == float_round_to_zero ) {
438	roundIncrement = 0;
439	}
440	else {
441	roundIncrement = 0x3FF;
442	if ( zSign ) {
443	if ( roundingMode == float_round_up ) roundIncrement = 0;
444	}
445	else {
446	if ( roundingMode == float_round_down ) roundIncrement = 0;
447	}
448	}
449	}
450	roundBits = zSig & 0x3FF;
451	if ( 0x7FD <= (bits16) zExp ) {
452	if ( ( 0x7FD < zExp )
453	\|\| ( ( zExp == 0x7FD )
454	&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
455	) {
456	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
457	return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
458	}
459	if ( zExp < 0 ) {
460	isTiny =
461	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
462	\|\| ( zExp < -1 )
463	\|\| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
464	shift64RightJamming( zSig, - zExp, &zSig );
465	zExp = 0;
466	roundBits = zSig & 0x3FF;
467	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
468	}
469	}
470	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
471	zSig = ( zSig + roundIncrement )>>10;
472	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
473	if ( zSig == 0 ) zExp = 0;
474	return packFloat64( zSign, zExp, zSig );
475
476	}
477
478	/*----------------------------------------------------------------------------
479	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
480	\| and significand `zSig', and returns the proper double-precision floating-
481	\| point value corresponding to the abstract input. This routine is just like
482	\| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
483	\| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
484	\| floating-point exponent.
485	----------------------------------------------------------------------------/
486
487	static float64
488	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
489	{
490	int8 shiftCount;
491
492	shiftCount = countLeadingZeros64( zSig ) - 1;
493	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
494
495	}
496
497	#ifdef FLOATX80
498
499	/*----------------------------------------------------------------------------
500	\| Returns the fraction bits of the extended double-precision floating-point
501	\| value `a'.
502	----------------------------------------------------------------------------/
503
504	INLINE bits64 extractFloatx80Frac( floatx80 a )
505	{
506
507	return a.low;
508
509	}
510
511	/*----------------------------------------------------------------------------
512	\| Returns the exponent bits of the extended double-precision floating-point
513	\| value `a'.
514	----------------------------------------------------------------------------/
515
516	INLINE int32 extractFloatx80Exp( floatx80 a )
517	{
518
519	return a.high & 0x7FFF;
520
521	}
522
523	/*----------------------------------------------------------------------------
524	\| Returns the sign bit of the extended double-precision floating-point value
525	\| `a'.
526	----------------------------------------------------------------------------/
527
528	INLINE flag extractFloatx80Sign( floatx80 a )
529	{
530
531	return a.high>>15;
532
533	}
534
535	/*----------------------------------------------------------------------------
536	\| Normalizes the subnormal extended double-precision floating-point value
537	\| represented by the denormalized significand `aSig'. The normalized exponent
538	\| and significand are stored at the locations pointed to by `zExpPtr' and
539	\| `zSigPtr', respectively.
540	----------------------------------------------------------------------------/
541
542	static void
543	normalizeFloatx80Subnormal( bits64 aSig, int32 zExpPtr, bits64 zSigPtr )
544	{
545	int8 shiftCount;
546
547	shiftCount = countLeadingZeros64( aSig );
548	*zSigPtr = aSig<<shiftCount;
549	*zExpPtr = 1 - shiftCount;
550
551	}
552
553	/*----------------------------------------------------------------------------
554	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
555	\| extended double-precision floating-point value, returning the result.
556	----------------------------------------------------------------------------/
557
558	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
559	{
560	floatx80 z;
561
562	z.low = zSig;
563	z.high = ( ( (bits16) zSign )<<15 ) + zExp;
564	return z;
565
566	}
567
568	/*----------------------------------------------------------------------------
569	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570	\| and extended significand formed by the concatenation of `zSig0' and `zSig1',
571	\| and returns the proper extended double-precision floating-point value
572	\| corresponding to the abstract input. Ordinarily, the abstract value is
573	\| rounded and packed into the extended double-precision format, with the
574	\| inexact exception raised if the abstract input cannot be represented
575	\| exactly. However, if the abstract value is too large, the overflow and
576	\| inexact exceptions are raised and an infinity or maximal finite value is
577	\| returned. If the abstract value is too small, the input value is rounded to
578	\| a subnormal number, and the underflow and inexact exceptions are raised if
579	\| the abstract input cannot be represented exactly as a subnormal extended
580	\| double-precision floating-point number.
581	\| If `roundingPrecision' is 32 or 64, the result is rounded to the same
582	\| number of bits as single or double precision, respectively. Otherwise, the
583	\| result is rounded to the full precision of the extended double-precision
584	\| format.
585	\| The input significand must be normalized or smaller. If the input
586	\| significand is not normalized, `zExp' must be 0; in that case, the result
587	\| returned is a subnormal number, and it must not require rounding. The
588	\| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
589	\| Floating-Point Arithmetic.
590	----------------------------------------------------------------------------/
591
592	static floatx80
593	roundAndPackFloatx80(
594	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
595	STATUS_PARAM)
596	{
597	int8 roundingMode;
598	flag roundNearestEven, increment, isTiny;
599	int64 roundIncrement, roundMask, roundBits;
600
601	roundingMode = STATUS(float_rounding_mode);
602	roundNearestEven = ( roundingMode == float_round_nearest_even );
603	if ( roundingPrecision == 80 ) goto precision80;
604	if ( roundingPrecision == 64 ) {
605	roundIncrement = LIT64( 0x0000000000000400 );
606	roundMask = LIT64( 0x00000000000007FF );
607	}
608	else if ( roundingPrecision == 32 ) {
609	roundIncrement = LIT64( 0x0000008000000000 );
610	roundMask = LIT64( 0x000000FFFFFFFFFF );
611	}
612	else {
613	goto precision80;
614	}
615	zSig0 \|= ( zSig1 != 0 );
616	if ( ! roundNearestEven ) {
617	if ( roundingMode == float_round_to_zero ) {
618	roundIncrement = 0;
619	}
620	else {
621	roundIncrement = roundMask;
622	if ( zSign ) {
623	if ( roundingMode == float_round_up ) roundIncrement = 0;
624	}
625	else {
626	if ( roundingMode == float_round_down ) roundIncrement = 0;
627	}
628	}
629	}
630	roundBits = zSig0 & roundMask;
631	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
632	if ( ( 0x7FFE < zExp )
633	\|\| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
634	) {
635	goto overflow;
636	}
637	if ( zExp <= 0 ) {
638	isTiny =
639	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
640	\|\| ( zExp < 0 )
641	\|\| ( zSig0 <= zSig0 + roundIncrement );
642	shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
643	zExp = 0;
644	roundBits = zSig0 & roundMask;
645	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
646	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
647	zSig0 += roundIncrement;
648	if ( (sbits64) zSig0 < 0 ) zExp = 1;
649	roundIncrement = roundMask + 1;
650	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
651	roundMask \|= roundIncrement;
652	}
653	zSig0 &= ~ roundMask;
654	return packFloatx80( zSign, zExp, zSig0 );
655	}
656	}
657	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
658	zSig0 += roundIncrement;
659	if ( zSig0 < roundIncrement ) {
660	++zExp;
661	zSig0 = LIT64( 0x8000000000000000 );
662	}
663	roundIncrement = roundMask + 1;
664	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
665	roundMask \|= roundIncrement;
666	}
667	zSig0 &= ~ roundMask;
668	if ( zSig0 == 0 ) zExp = 0;
669	return packFloatx80( zSign, zExp, zSig0 );
670	precision80:
671	increment = ( (sbits64) zSig1 < 0 );
672	if ( ! roundNearestEven ) {
673	if ( roundingMode == float_round_to_zero ) {
674	increment = 0;
675	}
676	else {
677	if ( zSign ) {
678	increment = ( roundingMode == float_round_down ) && zSig1;
679	}
680	else {
681	increment = ( roundingMode == float_round_up ) && zSig1;
682	}
683	}
684	}
685	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686	if ( ( 0x7FFE < zExp )
687	\|\| ( ( zExp == 0x7FFE )
688	&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
689	&& increment
690	)
691	) {
692	roundMask = 0;
693	overflow:
694	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
695	if ( ( roundingMode == float_round_to_zero )
696	\|\| ( zSign && ( roundingMode == float_round_up ) )
697	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
698	) {
699	return packFloatx80( zSign, 0x7FFE, ~ roundMask );
700	}
701	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
702	}
703	if ( zExp <= 0 ) {
704	isTiny =
705	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
706	\|\| ( zExp < 0 )
707	\|\| ! increment
708	\|\| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
709	shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
710	zExp = 0;
711	if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
712	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
713	if ( roundNearestEven ) {
714	increment = ( (sbits64) zSig1 < 0 );
715	}
716	else {
717	if ( zSign ) {
718	increment = ( roundingMode == float_round_down ) && zSig1;
719	}
720	else {
721	increment = ( roundingMode == float_round_up ) && zSig1;
722	}
723	}
724	if ( increment ) {
725	++zSig0;
726	zSig0 &=
727	~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
728	if ( (sbits64) zSig0 < 0 ) zExp = 1;
729	}
730	return packFloatx80( zSign, zExp, zSig0 );
731	}
732	}
733	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
734	if ( increment ) {
735	++zSig0;
736	if ( zSig0 == 0 ) {
737	++zExp;
738	zSig0 = LIT64( 0x8000000000000000 );
739	}
740	else {
741	zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
742	}
743	}
744	else {
745	if ( zSig0 == 0 ) zExp = 0;
746	}
747	return packFloatx80( zSign, zExp, zSig0 );
748
749	}
750
751	/*----------------------------------------------------------------------------
752	\| Takes an abstract floating-point value having sign `zSign', exponent
753	\| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
754	\| and returns the proper extended double-precision floating-point value
755	\| corresponding to the abstract input. This routine is just like
756	\| `roundAndPackFloatx80' except that the input significand does not have to be
757	\| normalized.
758	----------------------------------------------------------------------------/
759
760	static floatx80
761	normalizeRoundAndPackFloatx80(
762	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
763	STATUS_PARAM)
764	{
765	int8 shiftCount;
766
767	if ( zSig0 == 0 ) {
768	zSig0 = zSig1;
769	zSig1 = 0;
770	zExp -= 64;
771	}
772	shiftCount = countLeadingZeros64( zSig0 );
773	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
774	zExp -= shiftCount;
775	return
776	roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
777
778	}
779
780	#endif
781
782	#ifdef FLOAT128
783
784	/*----------------------------------------------------------------------------
785	\| Returns the least-significant 64 fraction bits of the quadruple-precision
786	\| floating-point value `a'.
787	----------------------------------------------------------------------------/
788
789	INLINE bits64 extractFloat128Frac1( float128 a )
790	{
791
792	return a.low;
793
794	}
795
796	/*----------------------------------------------------------------------------
797	\| Returns the most-significant 48 fraction bits of the quadruple-precision
798	\| floating-point value `a'.
799	----------------------------------------------------------------------------/
800
801	INLINE bits64 extractFloat128Frac0( float128 a )
802	{
803
804	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
805
806	}
807
808	/*----------------------------------------------------------------------------
809	\| Returns the exponent bits of the quadruple-precision floating-point value
810	\| `a'.
811	----------------------------------------------------------------------------/
812
813	INLINE int32 extractFloat128Exp( float128 a )
814	{
815
816	return ( a.high>>48 ) & 0x7FFF;
817
818	}
819
820	/*----------------------------------------------------------------------------
821	\| Returns the sign bit of the quadruple-precision floating-point value `a'.
822	----------------------------------------------------------------------------/
823
824	INLINE flag extractFloat128Sign( float128 a )
825	{
826
827	return a.high>>63;
828
829	}
830
831	/*----------------------------------------------------------------------------
832	\| Normalizes the subnormal quadruple-precision floating-point value
833	\| represented by the denormalized significand formed by the concatenation of
834	\| `aSig0' and `aSig1'. The normalized exponent is stored at the location
835	\| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
836	\| significand are stored at the location pointed to by `zSig0Ptr', and the
837	\| least significant 64 bits of the normalized significand are stored at the
838	\| location pointed to by `zSig1Ptr'.
839	----------------------------------------------------------------------------/
840
841	static void
842	normalizeFloat128Subnormal(
843	bits64 aSig0,
844	bits64 aSig1,
845	int32 *zExpPtr,
846	bits64 *zSig0Ptr,
847	bits64 *zSig1Ptr
848	)
849	{
850	int8 shiftCount;
851
852	if ( aSig0 == 0 ) {
853	shiftCount = countLeadingZeros64( aSig1 ) - 15;
854	if ( shiftCount < 0 ) {
855	*zSig0Ptr = aSig1>>( - shiftCount );
856	*zSig1Ptr = aSig1<<( shiftCount & 63 );
857	}
858	else {
859	*zSig0Ptr = aSig1<<shiftCount;
860	*zSig1Ptr = 0;
861	}
862	*zExpPtr = - shiftCount - 63;
863	}
864	else {
865	shiftCount = countLeadingZeros64( aSig0 ) - 15;
866	shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
867	*zExpPtr = 1 - shiftCount;
868	}
869
870	}
871
872	/*----------------------------------------------------------------------------
873	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
874	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
875	\| floating-point value, returning the result. After being shifted into the
876	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
877	\| added together to form the most significant 32 bits of the result. This
878	\| means that any integer portion of `zSig0' will be added into the exponent.
879	\| Since a properly normalized significand will have an integer portion equal
880	\| to 1, the `zExp' input should be 1 less than the desired result exponent
881	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
882	\| significand.
883	----------------------------------------------------------------------------/
884
885	INLINE float128
886	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
887	{
888	float128 z;
889
890	z.low = zSig1;
891	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
892	return z;
893
894	}
895
896	/*----------------------------------------------------------------------------
897	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
898	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
899	\| and `zSig2', and returns the proper quadruple-precision floating-point value
900	\| corresponding to the abstract input. Ordinarily, the abstract value is
901	\| simply rounded and packed into the quadruple-precision format, with the
902	\| inexact exception raised if the abstract input cannot be represented
903	\| exactly. However, if the abstract value is too large, the overflow and
904	\| inexact exceptions are raised and an infinity or maximal finite value is
905	\| returned. If the abstract value is too small, the input value is rounded to
906	\| a subnormal number, and the underflow and inexact exceptions are raised if
907	\| the abstract input cannot be represented exactly as a subnormal quadruple-
908	\| precision floating-point number.
909	\| The input significand must be normalized or smaller. If the input
910	\| significand is not normalized, `zExp' must be 0; in that case, the result
911	\| returned is a subnormal number, and it must not require rounding. In the
912	\| usual case that the input significand is normalized, `zExp' must be 1 less
913	\| than the ``true'' floating-point exponent. The handling of underflow and
914	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
915	----------------------------------------------------------------------------/
916
917	static float128
918	roundAndPackFloat128(
919	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
920	{
921	int8 roundingMode;
922	flag roundNearestEven, increment, isTiny;
923
924	roundingMode = STATUS(float_rounding_mode);
925	roundNearestEven = ( roundingMode == float_round_nearest_even );
926	increment = ( (sbits64) zSig2 < 0 );
927	if ( ! roundNearestEven ) {
928	if ( roundingMode == float_round_to_zero ) {
929	increment = 0;
930	}
931	else {
932	if ( zSign ) {
933	increment = ( roundingMode == float_round_down ) && zSig2;
934	}
935	else {
936	increment = ( roundingMode == float_round_up ) && zSig2;
937	}
938	}
939	}
940	if ( 0x7FFD <= (bits32) zExp ) {
941	if ( ( 0x7FFD < zExp )
942	\|\| ( ( zExp == 0x7FFD )
943	&& eq128(
944	LIT64( 0x0001FFFFFFFFFFFF ),
945	LIT64( 0xFFFFFFFFFFFFFFFF ),
946	zSig0,
947	zSig1
948	)
949	&& increment
950	)
951	) {
952	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
953	if ( ( roundingMode == float_round_to_zero )
954	\|\| ( zSign && ( roundingMode == float_round_up ) )
955	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
956	) {
957	return
958	packFloat128(
959	zSign,
960	0x7FFE,
961	LIT64( 0x0000FFFFFFFFFFFF ),
962	LIT64( 0xFFFFFFFFFFFFFFFF )
963	);
964	}
965	return packFloat128( zSign, 0x7FFF, 0, 0 );
966	}
967	if ( zExp < 0 ) {
968	isTiny =
969	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
970	\|\| ( zExp < -1 )
971	\|\| ! increment
972	\|\| lt128(
973	zSig0,
974	zSig1,
975	LIT64( 0x0001FFFFFFFFFFFF ),
976	LIT64( 0xFFFFFFFFFFFFFFFF )
977	);
978	shift128ExtraRightJamming(
979	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
980	zExp = 0;
981	if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
982	if ( roundNearestEven ) {
983	increment = ( (sbits64) zSig2 < 0 );
984	}
985	else {
986	if ( zSign ) {
987	increment = ( roundingMode == float_round_down ) && zSig2;
988	}
989	else {
990	increment = ( roundingMode == float_round_up ) && zSig2;
991	}
992	}
993	}
994	}
995	if ( zSig2 ) STATUS(float_exception_flags) \|= float_flag_inexact;
996	if ( increment ) {
997	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
998	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
999	}
1000	else {
1001	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
1002	}
1003	return packFloat128( zSign, zExp, zSig0, zSig1 );
1004
1005	}
1006
1007	/*----------------------------------------------------------------------------
1008	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
1010	\| returns the proper quadruple-precision floating-point value corresponding
1011	\| to the abstract input. This routine is just like `roundAndPackFloat128'
1012	\| except that the input significand has fewer bits and does not have to be
1013	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1014	\| point exponent.
1015	----------------------------------------------------------------------------/
1016
1017	static float128
1018	normalizeRoundAndPackFloat128(
1019	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1020	{
1021	int8 shiftCount;
1022	bits64 zSig2;
1023
1024	if ( zSig0 == 0 ) {
1025	zSig0 = zSig1;
1026	zSig1 = 0;
1027	zExp -= 64;
1028	}
1029	shiftCount = countLeadingZeros64( zSig0 ) - 15;
1030	if ( 0 <= shiftCount ) {
1031	zSig2 = 0;
1032	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1033	}
1034	else {
1035	shift128ExtraRightJamming(
1036	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1037	}
1038	zExp -= shiftCount;
1039	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1040
1041	}
1042
1043	#endif
1044
1045	/*----------------------------------------------------------------------------
1046	\| Returns the result of converting the 32-bit two's complement integer `a'
1047	\| to the single-precision floating-point format. The conversion is performed
1048	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049	----------------------------------------------------------------------------/
1050
1051	float32 int32_to_float32( int32 a STATUS_PARAM )
1052	{
1053	flag zSign;
1054
1055	if ( a == 0 ) return float32_zero;
1056	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1057	zSign = ( a < 0 );
1058	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1059
1060	}
1061
1062	/*----------------------------------------------------------------------------
1063	\| Returns the result of converting the 32-bit two's complement integer `a'
1064	\| to the double-precision floating-point format. The conversion is performed
1065	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066	----------------------------------------------------------------------------/
1067
1068	float64 int32_to_float64( int32 a STATUS_PARAM )
1069	{
1070	flag zSign;
1071	uint32 absA;
1072	int8 shiftCount;
1073	bits64 zSig;
1074
1075	if ( a == 0 ) return float64_zero;
1076	zSign = ( a < 0 );
1077	absA = zSign ? - a : a;
1078	shiftCount = countLeadingZeros32( absA ) + 21;
1079	zSig = absA;
1080	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1081
1082	}
1083
1084	#ifdef FLOATX80
1085
1086	/*----------------------------------------------------------------------------
1087	\| Returns the result of converting the 32-bit two's complement integer `a'
1088	\| to the extended double-precision floating-point format. The conversion
1089	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1090	\| Arithmetic.
1091	----------------------------------------------------------------------------/
1092
1093	floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1094	{
1095	flag zSign;
1096	uint32 absA;
1097	int8 shiftCount;
1098	bits64 zSig;
1099
1100	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1101	zSign = ( a < 0 );
1102	absA = zSign ? - a : a;
1103	shiftCount = countLeadingZeros32( absA ) + 32;
1104	zSig = absA;
1105	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1106
1107	}
1108
1109	#endif
1110
1111	#ifdef FLOAT128
1112
1113	/*----------------------------------------------------------------------------
1114	\| Returns the result of converting the 32-bit two's complement integer `a' to
1115	\| the quadruple-precision floating-point format. The conversion is performed
1116	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117	----------------------------------------------------------------------------/
1118
1119	float128 int32_to_float128( int32 a STATUS_PARAM )
1120	{
1121	flag zSign;
1122	uint32 absA;
1123	int8 shiftCount;
1124	bits64 zSig0;
1125
1126	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1127	zSign = ( a < 0 );
1128	absA = zSign ? - a : a;
1129	shiftCount = countLeadingZeros32( absA ) + 17;
1130	zSig0 = absA;
1131	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1132
1133	}
1134
1135	#endif
1136
1137	/*----------------------------------------------------------------------------
1138	\| Returns the result of converting the 64-bit two's complement integer `a'
1139	\| to the single-precision floating-point format. The conversion is performed
1140	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141	----------------------------------------------------------------------------/
1142
1143	float32 int64_to_float32( int64 a STATUS_PARAM )
1144	{
1145	flag zSign;
1146	uint64 absA;
1147	int8 shiftCount;
1148
1149	if ( a == 0 ) return float32_zero;
1150	zSign = ( a < 0 );
1151	absA = zSign ? - a : a;
1152	shiftCount = countLeadingZeros64( absA ) - 40;
1153	if ( 0 <= shiftCount ) {
1154	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1155	}
1156	else {
1157	shiftCount += 7;
1158	if ( shiftCount < 0 ) {
1159	shift64RightJamming( absA, - shiftCount, &absA );
1160	}
1161	else {
1162	absA <<= shiftCount;
1163	}
1164	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1165	}
1166
1167	}
1168
1169	float32 uint64_to_float32( uint64 a STATUS_PARAM )
1170	{
1171	int8 shiftCount;
1172
1173	if ( a == 0 ) return float32_zero;
1174	shiftCount = countLeadingZeros64( a ) - 40;
1175	if ( 0 <= shiftCount ) {
1176	return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1177	}
1178	else {
1179	shiftCount += 7;
1180	if ( shiftCount < 0 ) {
1181	shift64RightJamming( a, - shiftCount, &a );
1182	}
1183	else {
1184	a <<= shiftCount;
1185	}
1186	return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1187	}
1188	}
1189
1190	/*----------------------------------------------------------------------------
1191	\| Returns the result of converting the 64-bit two's complement integer `a'
1192	\| to the double-precision floating-point format. The conversion is performed
1193	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194	----------------------------------------------------------------------------/
1195
1196	float64 int64_to_float64( int64 a STATUS_PARAM )
1197	{
1198	flag zSign;
1199
1200	if ( a == 0 ) return float64_zero;
1201	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1202	return packFloat64( 1, 0x43E, 0 );
1203	}
1204	zSign = ( a < 0 );
1205	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1206
1207	}
1208
1209	float64 uint64_to_float64( uint64 a STATUS_PARAM )
1210	{
1211	if ( a == 0 ) return float64_zero;
1212	return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1213
1214	}
1215
1216	#ifdef FLOATX80
1217
1218	/*----------------------------------------------------------------------------
1219	\| Returns the result of converting the 64-bit two's complement integer `a'
1220	\| to the extended double-precision floating-point format. The conversion
1221	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1222	\| Arithmetic.
1223	----------------------------------------------------------------------------/
1224
1225	floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1226	{
1227	flag zSign;
1228	uint64 absA;
1229	int8 shiftCount;
1230
1231	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1232	zSign = ( a < 0 );
1233	absA = zSign ? - a : a;
1234	shiftCount = countLeadingZeros64( absA );
1235	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1236
1237	}
1238
1239	#endif
1240
1241	#ifdef FLOAT128
1242
1243	/*----------------------------------------------------------------------------
1244	\| Returns the result of converting the 64-bit two's complement integer `a' to
1245	\| the quadruple-precision floating-point format. The conversion is performed
1246	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247	----------------------------------------------------------------------------/
1248
1249	float128 int64_to_float128( int64 a STATUS_PARAM )
1250	{
1251	flag zSign;
1252	uint64 absA;
1253	int8 shiftCount;
1254	int32 zExp;
1255	bits64 zSig0, zSig1;
1256
1257	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258	zSign = ( a < 0 );
1259	absA = zSign ? - a : a;
1260	shiftCount = countLeadingZeros64( absA ) + 49;
1261	zExp = 0x406E - shiftCount;
1262	if ( 64 <= shiftCount ) {
1263	zSig1 = 0;
1264	zSig0 = absA;
1265	shiftCount -= 64;
1266	}
1267	else {
1268	zSig1 = absA;
1269	zSig0 = 0;
1270	}
1271	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1272	return packFloat128( zSign, zExp, zSig0, zSig1 );
1273
1274	}
1275
1276	#endif
1277
1278	/*----------------------------------------------------------------------------
1279	\| Returns the result of converting the single-precision floating-point value
1280	\| `a' to the 32-bit two's complement integer format. The conversion is
1281	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282	\| Arithmetic---which means in particular that the conversion is rounded
1283	\| according to the current rounding mode. If `a' is a NaN, the largest
1284	\| positive integer is returned. Otherwise, if the conversion overflows, the
1285	\| largest integer with the same sign as `a' is returned.
1286	----------------------------------------------------------------------------/
1287
1288	int32 float32_to_int32( float32 a STATUS_PARAM )
1289	{
1290	flag aSign;
1291	int16 aExp, shiftCount;
1292	bits32 aSig;
1293	bits64 aSig64;
1294
1295	aSig = extractFloat32Frac( a );
1296	aExp = extractFloat32Exp( a );
1297	aSign = extractFloat32Sign( a );
1298	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1299	if ( aExp ) aSig \|= 0x00800000;
1300	shiftCount = 0xAF - aExp;
1301	aSig64 = aSig;
1302	aSig64 <<= 32;
1303	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1304	return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1305
1306	}
1307
1308	/*----------------------------------------------------------------------------
1309	\| Returns the result of converting the single-precision floating-point value
1310	\| `a' to the 32-bit two's complement integer format. The conversion is
1311	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1312	\| Arithmetic, except that the conversion is always rounded toward zero.
1313	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1314	\| the conversion overflows, the largest integer with the same sign as `a' is
1315	\| returned.
1316	----------------------------------------------------------------------------/
1317
1318	int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1319	{
1320	flag aSign;
1321	int16 aExp, shiftCount;
1322	bits32 aSig;
1323	int32 z;
1324
1325	aSig = extractFloat32Frac( a );
1326	aExp = extractFloat32Exp( a );
1327	aSign = extractFloat32Sign( a );
1328	shiftCount = aExp - 0x9E;
1329	if ( 0 <= shiftCount ) {
1330	if ( float32_val(a) != 0xCF000000 ) {
1331	float_raise( float_flag_invalid STATUS_VAR);
1332	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1333	}
1334	return (sbits32) 0x80000000;
1335	}
1336	else if ( aExp <= 0x7E ) {
1337	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1338	return 0;
1339	}
1340	aSig = ( aSig \| 0x00800000 )<<8;
1341	z = aSig>>( - shiftCount );
1342	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1343	STATUS(float_exception_flags) \|= float_flag_inexact;
1344	}
1345	if ( aSign ) z = - z;
1346	return z;
1347
1348	}
1349
1350	/*----------------------------------------------------------------------------
1351	\| Returns the result of converting the single-precision floating-point value
1352	\| `a' to the 64-bit two's complement integer format. The conversion is
1353	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354	\| Arithmetic---which means in particular that the conversion is rounded
1355	\| according to the current rounding mode. If `a' is a NaN, the largest
1356	\| positive integer is returned. Otherwise, if the conversion overflows, the
1357	\| largest integer with the same sign as `a' is returned.
1358	----------------------------------------------------------------------------/
1359
1360	int64 float32_to_int64( float32 a STATUS_PARAM )
1361	{
1362	flag aSign;
1363	int16 aExp, shiftCount;
1364	bits32 aSig;
1365	bits64 aSig64, aSigExtra;
1366
1367	aSig = extractFloat32Frac( a );
1368	aExp = extractFloat32Exp( a );
1369	aSign = extractFloat32Sign( a );
1370	shiftCount = 0xBE - aExp;
1371	if ( shiftCount < 0 ) {
1372	float_raise( float_flag_invalid STATUS_VAR);
1373	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1374	return LIT64( 0x7FFFFFFFFFFFFFFF );
1375	}
1376	return (sbits64) LIT64( 0x8000000000000000 );
1377	}
1378	if ( aExp ) aSig \|= 0x00800000;
1379	aSig64 = aSig;
1380	aSig64 <<= 40;
1381	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1382	return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1383
1384	}
1385
1386	/*----------------------------------------------------------------------------
1387	\| Returns the result of converting the single-precision floating-point value
1388	\| `a' to the 64-bit two's complement integer format. The conversion is
1389	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1390	\| Arithmetic, except that the conversion is always rounded toward zero. If
1391	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1392	\| conversion overflows, the largest integer with the same sign as `a' is
1393	\| returned.
1394	----------------------------------------------------------------------------/
1395
1396	int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1397	{
1398	flag aSign;
1399	int16 aExp, shiftCount;
1400	bits32 aSig;
1401	bits64 aSig64;
1402	int64 z;
1403
1404	aSig = extractFloat32Frac( a );
1405	aExp = extractFloat32Exp( a );
1406	aSign = extractFloat32Sign( a );
1407	shiftCount = aExp - 0xBE;
1408	if ( 0 <= shiftCount ) {
1409	if ( float32_val(a) != 0xDF000000 ) {
1410	float_raise( float_flag_invalid STATUS_VAR);
1411	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1412	return LIT64( 0x7FFFFFFFFFFFFFFF );
1413	}
1414	}
1415	return (sbits64) LIT64( 0x8000000000000000 );
1416	}
1417	else if ( aExp <= 0x7E ) {
1418	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1419	return 0;
1420	}
1421	aSig64 = aSig \| 0x00800000;
1422	aSig64 <<= 40;
1423	z = aSig64>>( - shiftCount );
1424	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1425	STATUS(float_exception_flags) \|= float_flag_inexact;
1426	}
1427	if ( aSign ) z = - z;
1428	return z;
1429
1430	}
1431
1432	/*----------------------------------------------------------------------------
1433	\| Returns the result of converting the single-precision floating-point value
1434	\| `a' to the double-precision floating-point format. The conversion is
1435	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1436	\| Arithmetic.
1437	----------------------------------------------------------------------------/
1438
1439	float64 float32_to_float64( float32 a STATUS_PARAM )
1440	{
1441	flag aSign;
1442	int16 aExp;
1443	bits32 aSig;
1444
1445	aSig = extractFloat32Frac( a );
1446	aExp = extractFloat32Exp( a );
1447	aSign = extractFloat32Sign( a );
1448	if ( aExp == 0xFF ) {
1449	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1450	return packFloat64( aSign, 0x7FF, 0 );
1451	}
1452	if ( aExp == 0 ) {
1453	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1454	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1455	--aExp;
1456	}
1457	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1458
1459	}
1460
1461	#ifdef FLOATX80
1462
1463	/*----------------------------------------------------------------------------
1464	\| Returns the result of converting the single-precision floating-point value
1465	\| `a' to the extended double-precision floating-point format. The conversion
1466	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1467	\| Arithmetic.
1468	----------------------------------------------------------------------------/
1469
1470	floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1471	{
1472	flag aSign;
1473	int16 aExp;
1474	bits32 aSig;
1475
1476	aSig = extractFloat32Frac( a );
1477	aExp = extractFloat32Exp( a );
1478	aSign = extractFloat32Sign( a );
1479	if ( aExp == 0xFF ) {
1480	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1481	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1482	}
1483	if ( aExp == 0 ) {
1484	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1485	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1486	}
1487	aSig \|= 0x00800000;
1488	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1489
1490	}
1491
1492	#endif
1493
1494	#ifdef FLOAT128
1495
1496	/*----------------------------------------------------------------------------
1497	\| Returns the result of converting the single-precision floating-point value
1498	\| `a' to the double-precision floating-point format. The conversion is
1499	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1500	\| Arithmetic.
1501	----------------------------------------------------------------------------/
1502
1503	float128 float32_to_float128( float32 a STATUS_PARAM )
1504	{
1505	flag aSign;
1506	int16 aExp;
1507	bits32 aSig;
1508
1509	aSig = extractFloat32Frac( a );
1510	aExp = extractFloat32Exp( a );
1511	aSign = extractFloat32Sign( a );
1512	if ( aExp == 0xFF ) {
1513	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1514	return packFloat128( aSign, 0x7FFF, 0, 0 );
1515	}
1516	if ( aExp == 0 ) {
1517	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1518	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519	--aExp;
1520	}
1521	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1522
1523	}
1524
1525	#endif
1526
1527	/*----------------------------------------------------------------------------
1528	\| Rounds the single-precision floating-point value `a' to an integer, and
1529	\| returns the result as a single-precision floating-point value. The
1530	\| operation is performed according to the IEC/IEEE Standard for Binary
1531	\| Floating-Point Arithmetic.
1532	----------------------------------------------------------------------------/
1533
1534	float32 float32_round_to_int( float32 a STATUS_PARAM)
1535	{
1536	flag aSign;
1537	int16 aExp;
1538	bits32 lastBitMask, roundBitsMask;
1539	int8 roundingMode;
1540	bits32 z;
1541
1542	aExp = extractFloat32Exp( a );
1543	if ( 0x96 <= aExp ) {
1544	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1545	return propagateFloat32NaN( a, a STATUS_VAR );
1546	}
1547	return a;
1548	}
1549	if ( aExp <= 0x7E ) {
1550	if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1551	STATUS(float_exception_flags) \|= float_flag_inexact;
1552	aSign = extractFloat32Sign( a );
1553	switch ( STATUS(float_rounding_mode) ) {
1554	case float_round_nearest_even:
1555	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1556	return packFloat32( aSign, 0x7F, 0 );
1557	}
1558	break;
1559	case float_round_down:
1560	return make_float32(aSign ? 0xBF800000 : 0);
1561	case float_round_up:
1562	return make_float32(aSign ? 0x80000000 : 0x3F800000);
1563	}
1564	return packFloat32( aSign, 0, 0 );
1565	}
1566	lastBitMask = 1;
1567	lastBitMask <<= 0x96 - aExp;
1568	roundBitsMask = lastBitMask - 1;
1569	z = float32_val(a);
1570	roundingMode = STATUS(float_rounding_mode);
1571	if ( roundingMode == float_round_nearest_even ) {
1572	z += lastBitMask>>1;
1573	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1574	}
1575	else if ( roundingMode != float_round_to_zero ) {
1576	if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1577	z += roundBitsMask;
1578	}
1579	}
1580	z &= ~ roundBitsMask;
1581	if ( z != float32_val(a) ) STATUS(float_exception_flags) \|= float_flag_inexact;
1582	return make_float32(z);
1583
1584	}
1585
1586	/*----------------------------------------------------------------------------
1587	\| Returns the result of adding the absolute values of the single-precision
1588	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1589	\| before being returned. `zSign' is ignored if the result is a NaN.
1590	\| The addition is performed according to the IEC/IEEE Standard for Binary
1591	\| Floating-Point Arithmetic.
1592	----------------------------------------------------------------------------/
1593
1594	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1595	{
1596	int16 aExp, bExp, zExp;
1597	bits32 aSig, bSig, zSig;
1598	int16 expDiff;
1599
1600	aSig = extractFloat32Frac( a );
1601	aExp = extractFloat32Exp( a );
1602	bSig = extractFloat32Frac( b );
1603	bExp = extractFloat32Exp( b );
1604	expDiff = aExp - bExp;
1605	aSig <<= 6;
1606	bSig <<= 6;
1607	if ( 0 < expDiff ) {
1608	if ( aExp == 0xFF ) {
1609	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1610	return a;
1611	}
1612	if ( bExp == 0 ) {
1613	--expDiff;
1614	}
1615	else {
1616	bSig \|= 0x20000000;
1617	}
1618	shift32RightJamming( bSig, expDiff, &bSig );
1619	zExp = aExp;
1620	}
1621	else if ( expDiff < 0 ) {
1622	if ( bExp == 0xFF ) {
1623	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1624	return packFloat32( zSign, 0xFF, 0 );
1625	}
1626	if ( aExp == 0 ) {
1627	++expDiff;
1628	}
1629	else {
1630	aSig \|= 0x20000000;
1631	}
1632	shift32RightJamming( aSig, - expDiff, &aSig );
1633	zExp = bExp;
1634	}
1635	else {
1636	if ( aExp == 0xFF ) {
1637	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1638	return a;
1639	}
1640	if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1641	zSig = 0x40000000 + aSig + bSig;
1642	zExp = aExp;
1643	goto roundAndPack;
1644	}
1645	aSig \|= 0x20000000;
1646	zSig = ( aSig + bSig )<<1;
1647	--zExp;
1648	if ( (sbits32) zSig < 0 ) {
1649	zSig = aSig + bSig;
1650	++zExp;
1651	}
1652	roundAndPack:
1653	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1654
1655	}
1656
1657	/*----------------------------------------------------------------------------
1658	\| Returns the result of subtracting the absolute values of the single-
1659	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
1660	\| difference is negated before being returned. `zSign' is ignored if the
1661	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
1662	\| Standard for Binary Floating-Point Arithmetic.
1663	----------------------------------------------------------------------------/
1664
1665	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1666	{
1667	int16 aExp, bExp, zExp;
1668	bits32 aSig, bSig, zSig;
1669	int16 expDiff;
1670
1671	aSig = extractFloat32Frac( a );
1672	aExp = extractFloat32Exp( a );
1673	bSig = extractFloat32Frac( b );
1674	bExp = extractFloat32Exp( b );
1675	expDiff = aExp - bExp;
1676	aSig <<= 7;
1677	bSig <<= 7;
1678	if ( 0 < expDiff ) goto aExpBigger;
1679	if ( expDiff < 0 ) goto bExpBigger;
1680	if ( aExp == 0xFF ) {
1681	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1682	float_raise( float_flag_invalid STATUS_VAR);
1683	return float32_default_nan;
1684	}
1685	if ( aExp == 0 ) {
1686	aExp = 1;
1687	bExp = 1;
1688	}
1689	if ( bSig < aSig ) goto aBigger;
1690	if ( aSig < bSig ) goto bBigger;
1691	return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1692	bExpBigger:
1693	if ( bExp == 0xFF ) {
1694	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1695	return packFloat32( zSign ^ 1, 0xFF, 0 );
1696	}
1697	if ( aExp == 0 ) {
1698	++expDiff;
1699	}
1700	else {
1701	aSig \|= 0x40000000;
1702	}
1703	shift32RightJamming( aSig, - expDiff, &aSig );
1704	bSig \|= 0x40000000;
1705	bBigger:
1706	zSig = bSig - aSig;
1707	zExp = bExp;
1708	zSign ^= 1;
1709	goto normalizeRoundAndPack;
1710	aExpBigger:
1711	if ( aExp == 0xFF ) {
1712	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1713	return a;
1714	}
1715	if ( bExp == 0 ) {
1716	--expDiff;
1717	}
1718	else {
1719	bSig \|= 0x40000000;
1720	}
1721	shift32RightJamming( bSig, expDiff, &bSig );
1722	aSig \|= 0x40000000;
1723	aBigger:
1724	zSig = aSig - bSig;
1725	zExp = aExp;
1726	normalizeRoundAndPack:
1727	--zExp;
1728	return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1729
1730	}
1731
1732	/*----------------------------------------------------------------------------
1733	\| Returns the result of adding the single-precision floating-point values `a'
1734	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
1735	\| Binary Floating-Point Arithmetic.
1736	----------------------------------------------------------------------------/
1737
1738	float32 float32_add( float32 a, float32 b STATUS_PARAM )
1739	{
1740	flag aSign, bSign;
1741
1742	aSign = extractFloat32Sign( a );
1743	bSign = extractFloat32Sign( b );
1744	if ( aSign == bSign ) {
1745	return addFloat32Sigs( a, b, aSign STATUS_VAR);
1746	}
1747	else {
1748	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1749	}
1750
1751	}
1752
1753	/*----------------------------------------------------------------------------
1754	\| Returns the result of subtracting the single-precision floating-point values
1755	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1756	\| for Binary Floating-Point Arithmetic.
1757	----------------------------------------------------------------------------/
1758
1759	float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1760	{
1761	flag aSign, bSign;
1762
1763	aSign = extractFloat32Sign( a );
1764	bSign = extractFloat32Sign( b );
1765	if ( aSign == bSign ) {
1766	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1767	}
1768	else {
1769	return addFloat32Sigs( a, b, aSign STATUS_VAR );
1770	}
1771
1772	}
1773
1774	/*----------------------------------------------------------------------------
1775	\| Returns the result of multiplying the single-precision floating-point values
1776	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1777	\| for Binary Floating-Point Arithmetic.
1778	----------------------------------------------------------------------------/
1779
1780	float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1781	{
1782	flag aSign, bSign, zSign;
1783	int16 aExp, bExp, zExp;
1784	bits32 aSig, bSig;
1785	bits64 zSig64;
1786	bits32 zSig;
1787
1788	aSig = extractFloat32Frac( a );
1789	aExp = extractFloat32Exp( a );
1790	aSign = extractFloat32Sign( a );
1791	bSig = extractFloat32Frac( b );
1792	bExp = extractFloat32Exp( b );
1793	bSign = extractFloat32Sign( b );
1794	zSign = aSign ^ bSign;
1795	if ( aExp == 0xFF ) {
1796	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1797	return propagateFloat32NaN( a, b STATUS_VAR );
1798	}
1799	if ( ( bExp \| bSig ) == 0 ) {
1800	float_raise( float_flag_invalid STATUS_VAR);
1801	return float32_default_nan;
1802	}
1803	return packFloat32( zSign, 0xFF, 0 );
1804	}
1805	if ( bExp == 0xFF ) {
1806	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807	if ( ( aExp \| aSig ) == 0 ) {
1808	float_raise( float_flag_invalid STATUS_VAR);
1809	return float32_default_nan;
1810	}
1811	return packFloat32( zSign, 0xFF, 0 );
1812	}
1813	if ( aExp == 0 ) {
1814	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1815	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1816	}
1817	if ( bExp == 0 ) {
1818	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1819	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1820	}
1821	zExp = aExp + bExp - 0x7F;
1822	aSig = ( aSig \| 0x00800000 )<<7;
1823	bSig = ( bSig \| 0x00800000 )<<8;
1824	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1825	zSig = zSig64;
1826	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1827	zSig <<= 1;
1828	--zExp;
1829	}
1830	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1831
1832	}
1833
1834	/*----------------------------------------------------------------------------
1835	\| Returns the result of dividing the single-precision floating-point value `a'
1836	\| by the corresponding value `b'. The operation is performed according to the
1837	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838	----------------------------------------------------------------------------/
1839
1840	float32 float32_div( float32 a, float32 b STATUS_PARAM )
1841	{
1842	flag aSign, bSign, zSign;
1843	int16 aExp, bExp, zExp;
1844	bits32 aSig, bSig, zSig;
1845
1846	aSig = extractFloat32Frac( a );
1847	aExp = extractFloat32Exp( a );
1848	aSign = extractFloat32Sign( a );
1849	bSig = extractFloat32Frac( b );
1850	bExp = extractFloat32Exp( b );
1851	bSign = extractFloat32Sign( b );
1852	zSign = aSign ^ bSign;
1853	if ( aExp == 0xFF ) {
1854	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855	if ( bExp == 0xFF ) {
1856	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1857	float_raise( float_flag_invalid STATUS_VAR);
1858	return float32_default_nan;
1859	}
1860	return packFloat32( zSign, 0xFF, 0 );
1861	}
1862	if ( bExp == 0xFF ) {
1863	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864	return packFloat32( zSign, 0, 0 );
1865	}
1866	if ( bExp == 0 ) {
1867	if ( bSig == 0 ) {
1868	if ( ( aExp \| aSig ) == 0 ) {
1869	float_raise( float_flag_invalid STATUS_VAR);
1870	return float32_default_nan;
1871	}
1872	float_raise( float_flag_divbyzero STATUS_VAR);
1873	return packFloat32( zSign, 0xFF, 0 );
1874	}
1875	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1876	}
1877	if ( aExp == 0 ) {
1878	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1879	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1880	}
1881	zExp = aExp - bExp + 0x7D;
1882	aSig = ( aSig \| 0x00800000 )<<7;
1883	bSig = ( bSig \| 0x00800000 )<<8;
1884	if ( bSig <= ( aSig + aSig ) ) {
1885	aSig >>= 1;
1886	++zExp;
1887	}
1888	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1889	if ( ( zSig & 0x3F ) == 0 ) {
1890	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1891	}
1892	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1893
1894	}
1895
1896	/*----------------------------------------------------------------------------
1897	\| Returns the remainder of the single-precision floating-point value `a'
1898	\| with respect to the corresponding value `b'. The operation is performed
1899	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900	----------------------------------------------------------------------------/
1901
1902	float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1903	{
1904	flag aSign, bSign, zSign;
1905	int16 aExp, bExp, expDiff;
1906	bits32 aSig, bSig;
1907	bits32 q;
1908	bits64 aSig64, bSig64, q64;
1909	bits32 alternateASig;
1910	sbits32 sigMean;
1911
1912	aSig = extractFloat32Frac( a );
1913	aExp = extractFloat32Exp( a );
1914	aSign = extractFloat32Sign( a );
1915	bSig = extractFloat32Frac( b );
1916	bExp = extractFloat32Exp( b );
1917	bSign = extractFloat32Sign( b );
1918	if ( aExp == 0xFF ) {
1919	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1920	return propagateFloat32NaN( a, b STATUS_VAR );
1921	}
1922	float_raise( float_flag_invalid STATUS_VAR);
1923	return float32_default_nan;
1924	}
1925	if ( bExp == 0xFF ) {
1926	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1927	return a;
1928	}
1929	if ( bExp == 0 ) {
1930	if ( bSig == 0 ) {
1931	float_raise( float_flag_invalid STATUS_VAR);
1932	return float32_default_nan;
1933	}
1934	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1935	}
1936	if ( aExp == 0 ) {
1937	if ( aSig == 0 ) return a;
1938	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939	}
1940	expDiff = aExp - bExp;
1941	aSig \|= 0x00800000;
1942	bSig \|= 0x00800000;
1943	if ( expDiff < 32 ) {
1944	aSig <<= 8;
1945	bSig <<= 8;
1946	if ( expDiff < 0 ) {
1947	if ( expDiff < -1 ) return a;
1948	aSig >>= 1;
1949	}
1950	q = ( bSig <= aSig );
1951	if ( q ) aSig -= bSig;
1952	if ( 0 < expDiff ) {
1953	q = ( ( (bits64) aSig )<<32 ) / bSig;
1954	q >>= 32 - expDiff;
1955	bSig >>= 2;
1956	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1957	}
1958	else {
1959	aSig >>= 2;
1960	bSig >>= 2;
1961	}
1962	}
1963	else {
1964	if ( bSig <= aSig ) aSig -= bSig;
1965	aSig64 = ( (bits64) aSig )<<40;
1966	bSig64 = ( (bits64) bSig )<<40;
1967	expDiff -= 64;
1968	while ( 0 < expDiff ) {
1969	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1970	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1971	aSig64 = - ( ( bSig * q64 )<<38 );
1972	expDiff -= 62;
1973	}
1974	expDiff += 64;
1975	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1976	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1977	q = q64>>( 64 - expDiff );
1978	bSig <<= 6;
1979	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1980	}
1981	do {
1982	alternateASig = aSig;
1983	++q;
1984	aSig -= bSig;
1985	} while ( 0 <= (sbits32) aSig );
1986	sigMean = aSig + alternateASig;
1987	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1988	aSig = alternateASig;
1989	}
1990	zSign = ( (sbits32) aSig < 0 );
1991	if ( zSign ) aSig = - aSig;
1992	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1993
1994	}
1995
1996	/*----------------------------------------------------------------------------
1997	\| Returns the square root of the single-precision floating-point value `a'.
1998	\| The operation is performed according to the IEC/IEEE Standard for Binary
1999	\| Floating-Point Arithmetic.
2000	----------------------------------------------------------------------------/
2001
2002	float32 float32_sqrt( float32 a STATUS_PARAM )
2003	{
2004	flag aSign;
2005	int16 aExp, zExp;
2006	bits32 aSig, zSig;
2007	bits64 rem, term;
2008
2009	aSig = extractFloat32Frac( a );
2010	aExp = extractFloat32Exp( a );
2011	aSign = extractFloat32Sign( a );
2012	if ( aExp == 0xFF ) {
2013	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2014	if ( ! aSign ) return a;
2015	float_raise( float_flag_invalid STATUS_VAR);
2016	return float32_default_nan;
2017	}
2018	if ( aSign ) {
2019	if ( ( aExp \| aSig ) == 0 ) return a;
2020	float_raise( float_flag_invalid STATUS_VAR);
2021	return float32_default_nan;
2022	}
2023	if ( aExp == 0 ) {
2024	if ( aSig == 0 ) return float32_zero;
2025	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2026	}
2027	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2028	aSig = ( aSig \| 0x00800000 )<<8;
2029	zSig = estimateSqrt32( aExp, aSig ) + 2;
2030	if ( ( zSig & 0x7F ) <= 5 ) {
2031	if ( zSig < 2 ) {
2032	zSig = 0x7FFFFFFF;
2033	goto roundAndPack;
2034	}
2035	aSig >>= aExp & 1;
2036	term = ( (bits64) zSig ) * zSig;
2037	rem = ( ( (bits64) aSig )<<32 ) - term;
2038	while ( (sbits64) rem < 0 ) {
2039	--zSig;
2040	rem += ( ( (bits64) zSig )<<1 ) \| 1;
2041	}
2042	zSig \|= ( rem != 0 );
2043	}
2044	shift32RightJamming( zSig, 1, &zSig );
2045	roundAndPack:
2046	return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2047
2048	}
2049
2050	/*----------------------------------------------------------------------------
2051	\| Returns 1 if the single-precision floating-point value `a' is equal to
2052	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2053	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054	----------------------------------------------------------------------------/
2055
2056	int float32_eq( float32 a, float32 b STATUS_PARAM )
2057	{
2058
2059	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2060	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2061	) {
2062	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2063	float_raise( float_flag_invalid STATUS_VAR);
2064	}
2065	return 0;
2066	}
2067	return ( float32_val(a) == float32_val(b) ) \|\|
2068	( (bits32) ( ( float32_val(a) \| float32_val(b) )<<1 ) == 0 );
2069
2070	}
2071
2072	/*----------------------------------------------------------------------------
2073	\| Returns 1 if the single-precision floating-point value `a' is less than
2074	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
2075	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2076	\| Arithmetic.
2077	----------------------------------------------------------------------------/
2078
2079	int float32_le( float32 a, float32 b STATUS_PARAM )
2080	{
2081	flag aSign, bSign;
2082	bits32 av, bv;
2083
2084	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2085	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2086	) {
2087	float_raise( float_flag_invalid STATUS_VAR);
2088	return 0;
2089	}
2090	aSign = extractFloat32Sign( a );
2091	bSign = extractFloat32Sign( b );
2092	av = float32_val(a);
2093	bv = float32_val(b);
2094	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2095	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2096
2097	}
2098
2099	/*----------------------------------------------------------------------------
2100	\| Returns 1 if the single-precision floating-point value `a' is less than
2101	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2102	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103	----------------------------------------------------------------------------/
2104
2105	int float32_lt( float32 a, float32 b STATUS_PARAM )
2106	{
2107	flag aSign, bSign;
2108	bits32 av, bv;
2109
2110	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2111	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2112	) {
2113	float_raise( float_flag_invalid STATUS_VAR);
2114	return 0;
2115	}
2116	aSign = extractFloat32Sign( a );
2117	bSign = extractFloat32Sign( b );
2118	av = float32_val(a);
2119	bv = float32_val(b);
2120	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2121	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2122
2123	}
2124
2125	/*----------------------------------------------------------------------------
2126	\| Returns 1 if the single-precision floating-point value `a' is equal to
2127	\| the corresponding value `b', and 0 otherwise. The invalid exception is
2128	\| raised if either operand is a NaN. Otherwise, the comparison is performed
2129	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130	----------------------------------------------------------------------------/
2131
2132	int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2133	{
2134	bits32 av, bv;
2135
2136	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2137	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2138	) {
2139	float_raise( float_flag_invalid STATUS_VAR);
2140	return 0;
2141	}
2142	av = float32_val(a);
2143	bv = float32_val(b);
2144	return ( av == bv ) \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2145
2146	}
2147
2148	/*----------------------------------------------------------------------------
2149	\| Returns 1 if the single-precision floating-point value `a' is less than or
2150	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2151	\| cause an exception. Otherwise, the comparison is performed according to the
2152	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153	----------------------------------------------------------------------------/
2154
2155	int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2156	{
2157	flag aSign, bSign;
2158	bits32 av, bv;
2159
2160	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162	) {
2163	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2164	float_raise( float_flag_invalid STATUS_VAR);
2165	}
2166	return 0;
2167	}
2168	aSign = extractFloat32Sign( a );
2169	bSign = extractFloat32Sign( b );
2170	av = float32_val(a);
2171	bv = float32_val(b);
2172	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2173	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2174
2175	}
2176
2177	/*----------------------------------------------------------------------------
2178	\| Returns 1 if the single-precision floating-point value `a' is less than
2179	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2180	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2181	\| Standard for Binary Floating-Point Arithmetic.
2182	----------------------------------------------------------------------------/
2183
2184	int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2185	{
2186	flag aSign, bSign;
2187	bits32 av, bv;
2188
2189	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2191	) {
2192	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2193	float_raise( float_flag_invalid STATUS_VAR);
2194	}
2195	return 0;
2196	}
2197	aSign = extractFloat32Sign( a );
2198	bSign = extractFloat32Sign( b );
2199	av = float32_val(a);
2200	bv = float32_val(b);
2201	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2202	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2203
2204	}
2205
2206	/*----------------------------------------------------------------------------
2207	\| Returns the result of converting the double-precision floating-point value
2208	\| `a' to the 32-bit two's complement integer format. The conversion is
2209	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2210	\| Arithmetic---which means in particular that the conversion is rounded
2211	\| according to the current rounding mode. If `a' is a NaN, the largest
2212	\| positive integer is returned. Otherwise, if the conversion overflows, the
2213	\| largest integer with the same sign as `a' is returned.
2214	----------------------------------------------------------------------------/
2215
2216	int32 float64_to_int32( float64 a STATUS_PARAM )
2217	{
2218	flag aSign;
2219	int16 aExp, shiftCount;
2220	bits64 aSig;
2221
2222	aSig = extractFloat64Frac( a );
2223	aExp = extractFloat64Exp( a );
2224	aSign = extractFloat64Sign( a );
2225	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2226	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2227	shiftCount = 0x42C - aExp;
2228	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2229	return roundAndPackInt32( aSign, aSig STATUS_VAR );
2230
2231	}
2232
2233	/*----------------------------------------------------------------------------
2234	\| Returns the result of converting the double-precision floating-point value
2235	\| `a' to the 32-bit two's complement integer format. The conversion is
2236	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237	\| Arithmetic, except that the conversion is always rounded toward zero.
2238	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2239	\| the conversion overflows, the largest integer with the same sign as `a' is
2240	\| returned.
2241	----------------------------------------------------------------------------/
2242
2243	int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2244	{
2245	flag aSign;
2246	int16 aExp, shiftCount;
2247	bits64 aSig, savedASig;
2248	int32 z;
2249
2250	aSig = extractFloat64Frac( a );
2251	aExp = extractFloat64Exp( a );
2252	aSign = extractFloat64Sign( a );
2253	if ( 0x41E < aExp ) {
2254	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2255	goto invalid;
2256	}
2257	else if ( aExp < 0x3FF ) {
2258	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2259	return 0;
2260	}
2261	aSig \|= LIT64( 0x0010000000000000 );
2262	shiftCount = 0x433 - aExp;
2263	savedASig = aSig;
2264	aSig >>= shiftCount;
2265	z = aSig;
2266	if ( aSign ) z = - z;
2267	if ( ( z < 0 ) ^ aSign ) {
2268	invalid:
2269	float_raise( float_flag_invalid STATUS_VAR);
2270	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2271	}
2272	if ( ( aSig<<shiftCount ) != savedASig ) {
2273	STATUS(float_exception_flags) \|= float_flag_inexact;
2274	}
2275	return z;
2276
2277	}
2278
2279	/*----------------------------------------------------------------------------
2280	\| Returns the result of converting the double-precision floating-point value
2281	\| `a' to the 64-bit two's complement integer format. The conversion is
2282	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2283	\| Arithmetic---which means in particular that the conversion is rounded
2284	\| according to the current rounding mode. If `a' is a NaN, the largest
2285	\| positive integer is returned. Otherwise, if the conversion overflows, the
2286	\| largest integer with the same sign as `a' is returned.
2287	----------------------------------------------------------------------------/
2288
2289	int64 float64_to_int64( float64 a STATUS_PARAM )
2290	{
2291	flag aSign;
2292	int16 aExp, shiftCount;
2293	bits64 aSig, aSigExtra;
2294
2295	aSig = extractFloat64Frac( a );
2296	aExp = extractFloat64Exp( a );
2297	aSign = extractFloat64Sign( a );
2298	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2299	shiftCount = 0x433 - aExp;
2300	if ( shiftCount <= 0 ) {
2301	if ( 0x43E < aExp ) {
2302	float_raise( float_flag_invalid STATUS_VAR);
2303	if ( ! aSign
2304	\|\| ( ( aExp == 0x7FF )
2305	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2306	) {
2307	return LIT64( 0x7FFFFFFFFFFFFFFF );
2308	}
2309	return (sbits64) LIT64( 0x8000000000000000 );
2310	}
2311	aSigExtra = 0;
2312	aSig <<= - shiftCount;
2313	}
2314	else {
2315	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2316	}
2317	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2318
2319	}
2320
2321	/*----------------------------------------------------------------------------
2322	\| Returns the result of converting the double-precision floating-point value
2323	\| `a' to the 64-bit two's complement integer format. The conversion is
2324	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2325	\| Arithmetic, except that the conversion is always rounded toward zero.
2326	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2327	\| the conversion overflows, the largest integer with the same sign as `a' is
2328	\| returned.
2329	----------------------------------------------------------------------------/
2330
2331	int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2332	{
2333	flag aSign;
2334	int16 aExp, shiftCount;
2335	bits64 aSig;
2336	int64 z;
2337
2338	aSig = extractFloat64Frac( a );
2339	aExp = extractFloat64Exp( a );
2340	aSign = extractFloat64Sign( a );
2341	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2342	shiftCount = aExp - 0x433;
2343	if ( 0 <= shiftCount ) {
2344	if ( 0x43E <= aExp ) {
2345	if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2346	float_raise( float_flag_invalid STATUS_VAR);
2347	if ( ! aSign
2348	\|\| ( ( aExp == 0x7FF )
2349	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2350	) {
2351	return LIT64( 0x7FFFFFFFFFFFFFFF );
2352	}
2353	}
2354	return (sbits64) LIT64( 0x8000000000000000 );
2355	}
2356	z = aSig<<shiftCount;
2357	}
2358	else {
2359	if ( aExp < 0x3FE ) {
2360	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2361	return 0;
2362	}
2363	z = aSig>>( - shiftCount );
2364	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2365	STATUS(float_exception_flags) \|= float_flag_inexact;
2366	}
2367	}
2368	if ( aSign ) z = - z;
2369	return z;
2370
2371	}
2372
2373	/*----------------------------------------------------------------------------
2374	\| Returns the result of converting the double-precision floating-point value
2375	\| `a' to the single-precision floating-point format. The conversion is
2376	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2377	\| Arithmetic.
2378	----------------------------------------------------------------------------/
2379
2380	float32 float64_to_float32( float64 a STATUS_PARAM )
2381	{
2382	flag aSign;
2383	int16 aExp;
2384	bits64 aSig;
2385	bits32 zSig;
2386
2387	aSig = extractFloat64Frac( a );
2388	aExp = extractFloat64Exp( a );
2389	aSign = extractFloat64Sign( a );
2390	if ( aExp == 0x7FF ) {
2391	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2392	return packFloat32( aSign, 0xFF, 0 );
2393	}
2394	shift64RightJamming( aSig, 22, &aSig );
2395	zSig = aSig;
2396	if ( aExp \|\| zSig ) {
2397	zSig \|= 0x40000000;
2398	aExp -= 0x381;
2399	}
2400	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2401
2402	}
2403
2404	#ifdef FLOATX80
2405
2406	/*----------------------------------------------------------------------------
2407	\| Returns the result of converting the double-precision floating-point value
2408	\| `a' to the extended double-precision floating-point format. The conversion
2409	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2410	\| Arithmetic.
2411	----------------------------------------------------------------------------/
2412
2413	floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2414	{
2415	flag aSign;
2416	int16 aExp;
2417	bits64 aSig;
2418
2419	aSig = extractFloat64Frac( a );
2420	aExp = extractFloat64Exp( a );
2421	aSign = extractFloat64Sign( a );
2422	if ( aExp == 0x7FF ) {
2423	if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2424	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2425	}
2426	if ( aExp == 0 ) {
2427	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2428	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2429	}
2430	return
2431	packFloatx80(
2432	aSign, aExp + 0x3C00, ( aSig \| LIT64( 0x0010000000000000 ) )<<11 );
2433
2434	}
2435
2436	#endif
2437
2438	#ifdef FLOAT128
2439
2440	/*----------------------------------------------------------------------------
2441	\| Returns the result of converting the double-precision floating-point value
2442	\| `a' to the quadruple-precision floating-point format. The conversion is
2443	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2444	\| Arithmetic.
2445	----------------------------------------------------------------------------/
2446
2447	float128 float64_to_float128( float64 a STATUS_PARAM )
2448	{
2449	flag aSign;
2450	int16 aExp;
2451	bits64 aSig, zSig0, zSig1;
2452
2453	aSig = extractFloat64Frac( a );
2454	aExp = extractFloat64Exp( a );
2455	aSign = extractFloat64Sign( a );
2456	if ( aExp == 0x7FF ) {
2457	if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2458	return packFloat128( aSign, 0x7FFF, 0, 0 );
2459	}
2460	if ( aExp == 0 ) {
2461	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2462	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2463	--aExp;
2464	}
2465	shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2466	return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2467
2468	}
2469
2470	#endif
2471
2472	/*----------------------------------------------------------------------------
2473	\| Rounds the double-precision floating-point value `a' to an integer, and
2474	\| returns the result as a double-precision floating-point value. The
2475	\| operation is performed according to the IEC/IEEE Standard for Binary
2476	\| Floating-Point Arithmetic.
2477	----------------------------------------------------------------------------/
2478
2479	float64 float64_round_to_int( float64 a STATUS_PARAM )
2480	{
2481	flag aSign;
2482	int16 aExp;
2483	bits64 lastBitMask, roundBitsMask;
2484	int8 roundingMode;
2485	bits64 z;
2486
2487	aExp = extractFloat64Exp( a );
2488	if ( 0x433 <= aExp ) {
2489	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2490	return propagateFloat64NaN( a, a STATUS_VAR );
2491	}
2492	return a;
2493	}
2494	if ( aExp < 0x3FF ) {
2495	if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2496	STATUS(float_exception_flags) \|= float_flag_inexact;
2497	aSign = extractFloat64Sign( a );
2498	switch ( STATUS(float_rounding_mode) ) {
2499	case float_round_nearest_even:
2500	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2501	return packFloat64( aSign, 0x3FF, 0 );
2502	}
2503	break;
2504	case float_round_down:
2505	return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2506	case float_round_up:
2507	return make_float64(
2508	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2509	}
2510	return packFloat64( aSign, 0, 0 );
2511	}
2512	lastBitMask = 1;
2513	lastBitMask <<= 0x433 - aExp;
2514	roundBitsMask = lastBitMask - 1;
2515	z = float64_val(a);
2516	roundingMode = STATUS(float_rounding_mode);
2517	if ( roundingMode == float_round_nearest_even ) {
2518	z += lastBitMask>>1;
2519	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2520	}
2521	else if ( roundingMode != float_round_to_zero ) {
2522	if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2523	z += roundBitsMask;
2524	}
2525	}
2526	z &= ~ roundBitsMask;
2527	if ( z != float64_val(a) )
2528	STATUS(float_exception_flags) \|= float_flag_inexact;
2529	return make_float64(z);
2530
2531	}
2532
2533	float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2534	{
2535	int oldmode;
2536	float64 res;
2537	oldmode = STATUS(float_rounding_mode);
2538	STATUS(float_rounding_mode) = float_round_to_zero;
2539	res = float64_round_to_int(a STATUS_VAR);
2540	STATUS(float_rounding_mode) = oldmode;
2541	return res;
2542	}
2543
2544	/*----------------------------------------------------------------------------
2545	\| Returns the result of adding the absolute values of the double-precision
2546	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2547	\| before being returned. `zSign' is ignored if the result is a NaN.
2548	\| The addition is performed according to the IEC/IEEE Standard for Binary
2549	\| Floating-Point Arithmetic.
2550	----------------------------------------------------------------------------/
2551
2552	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2553	{
2554	int16 aExp, bExp, zExp;
2555	bits64 aSig, bSig, zSig;
2556	int16 expDiff;
2557
2558	aSig = extractFloat64Frac( a );
2559	aExp = extractFloat64Exp( a );
2560	bSig = extractFloat64Frac( b );
2561	bExp = extractFloat64Exp( b );
2562	expDiff = aExp - bExp;
2563	aSig <<= 9;
2564	bSig <<= 9;
2565	if ( 0 < expDiff ) {
2566	if ( aExp == 0x7FF ) {
2567	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2568	return a;
2569	}
2570	if ( bExp == 0 ) {
2571	--expDiff;
2572	}
2573	else {
2574	bSig \|= LIT64( 0x2000000000000000 );
2575	}
2576	shift64RightJamming( bSig, expDiff, &bSig );
2577	zExp = aExp;
2578	}
2579	else if ( expDiff < 0 ) {
2580	if ( bExp == 0x7FF ) {
2581	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582	return packFloat64( zSign, 0x7FF, 0 );
2583	}
2584	if ( aExp == 0 ) {
2585	++expDiff;
2586	}
2587	else {
2588	aSig \|= LIT64( 0x2000000000000000 );
2589	}
2590	shift64RightJamming( aSig, - expDiff, &aSig );
2591	zExp = bExp;
2592	}
2593	else {
2594	if ( aExp == 0x7FF ) {
2595	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2596	return a;
2597	}
2598	if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2599	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2600	zExp = aExp;
2601	goto roundAndPack;
2602	}
2603	aSig \|= LIT64( 0x2000000000000000 );
2604	zSig = ( aSig + bSig )<<1;
2605	--zExp;
2606	if ( (sbits64) zSig < 0 ) {
2607	zSig = aSig + bSig;
2608	++zExp;
2609	}
2610	roundAndPack:
2611	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2612
2613	}
2614
2615	/*----------------------------------------------------------------------------
2616	\| Returns the result of subtracting the absolute values of the double-
2617	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
2618	\| difference is negated before being returned. `zSign' is ignored if the
2619	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
2620	\| Standard for Binary Floating-Point Arithmetic.
2621	----------------------------------------------------------------------------/
2622
2623	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2624	{
2625	int16 aExp, bExp, zExp;
2626	bits64 aSig, bSig, zSig;
2627	int16 expDiff;
2628
2629	aSig = extractFloat64Frac( a );
2630	aExp = extractFloat64Exp( a );
2631	bSig = extractFloat64Frac( b );
2632	bExp = extractFloat64Exp( b );
2633	expDiff = aExp - bExp;
2634	aSig <<= 10;
2635	bSig <<= 10;
2636	if ( 0 < expDiff ) goto aExpBigger;
2637	if ( expDiff < 0 ) goto bExpBigger;
2638	if ( aExp == 0x7FF ) {
2639	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2640	float_raise( float_flag_invalid STATUS_VAR);
2641	return float64_default_nan;
2642	}
2643	if ( aExp == 0 ) {
2644	aExp = 1;
2645	bExp = 1;
2646	}
2647	if ( bSig < aSig ) goto aBigger;
2648	if ( aSig < bSig ) goto bBigger;
2649	return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2650	bExpBigger:
2651	if ( bExp == 0x7FF ) {
2652	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2653	return packFloat64( zSign ^ 1, 0x7FF, 0 );
2654	}
2655	if ( aExp == 0 ) {
2656	++expDiff;
2657	}
2658	else {
2659	aSig \|= LIT64( 0x4000000000000000 );
2660	}
2661	shift64RightJamming( aSig, - expDiff, &aSig );
2662	bSig \|= LIT64( 0x4000000000000000 );
2663	bBigger:
2664	zSig = bSig - aSig;
2665	zExp = bExp;
2666	zSign ^= 1;
2667	goto normalizeRoundAndPack;
2668	aExpBigger:
2669	if ( aExp == 0x7FF ) {
2670	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2671	return a;
2672	}
2673	if ( bExp == 0 ) {
2674	--expDiff;
2675	}
2676	else {
2677	bSig \|= LIT64( 0x4000000000000000 );
2678	}
2679	shift64RightJamming( bSig, expDiff, &bSig );
2680	aSig \|= LIT64( 0x4000000000000000 );
2681	aBigger:
2682	zSig = aSig - bSig;
2683	zExp = aExp;
2684	normalizeRoundAndPack:
2685	--zExp;
2686	return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2687
2688	}
2689
2690	/*----------------------------------------------------------------------------
2691	\| Returns the result of adding the double-precision floating-point values `a'
2692	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
2693	\| Binary Floating-Point Arithmetic.
2694	----------------------------------------------------------------------------/
2695
2696	float64 float64_add( float64 a, float64 b STATUS_PARAM )
2697	{
2698	flag aSign, bSign;
2699
2700	aSign = extractFloat64Sign( a );
2701	bSign = extractFloat64Sign( b );
2702	if ( aSign == bSign ) {
2703	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2704	}
2705	else {
2706	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2707	}
2708
2709	}
2710
2711	/*----------------------------------------------------------------------------
2712	\| Returns the result of subtracting the double-precision floating-point values
2713	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2714	\| for Binary Floating-Point Arithmetic.
2715	----------------------------------------------------------------------------/
2716
2717	float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2718	{
2719	flag aSign, bSign;
2720
2721	aSign = extractFloat64Sign( a );
2722	bSign = extractFloat64Sign( b );
2723	if ( aSign == bSign ) {
2724	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2725	}
2726	else {
2727	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2728	}
2729
2730	}
2731
2732	/*----------------------------------------------------------------------------
2733	\| Returns the result of multiplying the double-precision floating-point values
2734	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2735	\| for Binary Floating-Point Arithmetic.
2736	----------------------------------------------------------------------------/
2737
2738	float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2739	{
2740	flag aSign, bSign, zSign;
2741	int16 aExp, bExp, zExp;
2742	bits64 aSig, bSig, zSig0, zSig1;
2743
2744	aSig = extractFloat64Frac( a );
2745	aExp = extractFloat64Exp( a );
2746	aSign = extractFloat64Sign( a );
2747	bSig = extractFloat64Frac( b );
2748	bExp = extractFloat64Exp( b );
2749	bSign = extractFloat64Sign( b );
2750	zSign = aSign ^ bSign;
2751	if ( aExp == 0x7FF ) {
2752	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2753	return propagateFloat64NaN( a, b STATUS_VAR );
2754	}
2755	if ( ( bExp \| bSig ) == 0 ) {
2756	float_raise( float_flag_invalid STATUS_VAR);
2757	return float64_default_nan;
2758	}
2759	return packFloat64( zSign, 0x7FF, 0 );
2760	}
2761	if ( bExp == 0x7FF ) {
2762	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2763	if ( ( aExp \| aSig ) == 0 ) {
2764	float_raise( float_flag_invalid STATUS_VAR);
2765	return float64_default_nan;
2766	}
2767	return packFloat64( zSign, 0x7FF, 0 );
2768	}
2769	if ( aExp == 0 ) {
2770	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2771	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2772	}
2773	if ( bExp == 0 ) {
2774	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2775	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776	}
2777	zExp = aExp + bExp - 0x3FF;
2778	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2779	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2780	mul64To128( aSig, bSig, &zSig0, &zSig1 );
2781	zSig0 \|= ( zSig1 != 0 );
2782	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2783	zSig0 <<= 1;
2784	--zExp;
2785	}
2786	return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2787
2788	}
2789
2790	/*----------------------------------------------------------------------------
2791	\| Returns the result of dividing the double-precision floating-point value `a'
2792	\| by the corresponding value `b'. The operation is performed according to
2793	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794	----------------------------------------------------------------------------/
2795
2796	float64 float64_div( float64 a, float64 b STATUS_PARAM )
2797	{
2798	flag aSign, bSign, zSign;
2799	int16 aExp, bExp, zExp;
2800	bits64 aSig, bSig, zSig;
2801	bits64 rem0, rem1;
2802	bits64 term0, term1;
2803
2804	aSig = extractFloat64Frac( a );
2805	aExp = extractFloat64Exp( a );
2806	aSign = extractFloat64Sign( a );
2807	bSig = extractFloat64Frac( b );
2808	bExp = extractFloat64Exp( b );
2809	bSign = extractFloat64Sign( b );
2810	zSign = aSign ^ bSign;
2811	if ( aExp == 0x7FF ) {
2812	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2813	if ( bExp == 0x7FF ) {
2814	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2815	float_raise( float_flag_invalid STATUS_VAR);
2816	return float64_default_nan;
2817	}
2818	return packFloat64( zSign, 0x7FF, 0 );
2819	}
2820	if ( bExp == 0x7FF ) {
2821	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822	return packFloat64( zSign, 0, 0 );
2823	}
2824	if ( bExp == 0 ) {
2825	if ( bSig == 0 ) {
2826	if ( ( aExp \| aSig ) == 0 ) {
2827	float_raise( float_flag_invalid STATUS_VAR);
2828	return float64_default_nan;
2829	}
2830	float_raise( float_flag_divbyzero STATUS_VAR);
2831	return packFloat64( zSign, 0x7FF, 0 );
2832	}
2833	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834	}
2835	if ( aExp == 0 ) {
2836	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2837	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838	}
2839	zExp = aExp - bExp + 0x3FD;
2840	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
2841	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2842	if ( bSig <= ( aSig + aSig ) ) {
2843	aSig >>= 1;
2844	++zExp;
2845	}
2846	zSig = estimateDiv128To64( aSig, 0, bSig );
2847	if ( ( zSig & 0x1FF ) <= 2 ) {
2848	mul64To128( bSig, zSig, &term0, &term1 );
2849	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2850	while ( (sbits64) rem0 < 0 ) {
2851	--zSig;
2852	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2853	}
2854	zSig \|= ( rem1 != 0 );
2855	}
2856	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2857
2858	}
2859
2860	/*----------------------------------------------------------------------------
2861	\| Returns the remainder of the double-precision floating-point value `a'
2862	\| with respect to the corresponding value `b'. The operation is performed
2863	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864	----------------------------------------------------------------------------/
2865
2866	float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2867	{
2868	flag aSign, bSign, zSign;
2869	int16 aExp, bExp, expDiff;
2870	bits64 aSig, bSig;
2871	bits64 q, alternateASig;
2872	sbits64 sigMean;
2873
2874	aSig = extractFloat64Frac( a );
2875	aExp = extractFloat64Exp( a );
2876	aSign = extractFloat64Sign( a );
2877	bSig = extractFloat64Frac( b );
2878	bExp = extractFloat64Exp( b );
2879	bSign = extractFloat64Sign( b );
2880	if ( aExp == 0x7FF ) {
2881	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
2882	return propagateFloat64NaN( a, b STATUS_VAR );
2883	}
2884	float_raise( float_flag_invalid STATUS_VAR);
2885	return float64_default_nan;
2886	}
2887	if ( bExp == 0x7FF ) {
2888	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2889	return a;
2890	}
2891	if ( bExp == 0 ) {
2892	if ( bSig == 0 ) {
2893	float_raise( float_flag_invalid STATUS_VAR);
2894	return float64_default_nan;
2895	}
2896	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897	}
2898	if ( aExp == 0 ) {
2899	if ( aSig == 0 ) return a;
2900	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2901	}
2902	expDiff = aExp - bExp;
2903	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
2904	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
2905	if ( expDiff < 0 ) {
2906	if ( expDiff < -1 ) return a;
2907	aSig >>= 1;
2908	}
2909	q = ( bSig <= aSig );
2910	if ( q ) aSig -= bSig;
2911	expDiff -= 64;
2912	while ( 0 < expDiff ) {
2913	q = estimateDiv128To64( aSig, 0, bSig );
2914	q = ( 2 < q ) ? q - 2 : 0;
2915	aSig = - ( ( bSig>>2 ) * q );
2916	expDiff -= 62;
2917	}
2918	expDiff += 64;
2919	if ( 0 < expDiff ) {
2920	q = estimateDiv128To64( aSig, 0, bSig );
2921	q = ( 2 < q ) ? q - 2 : 0;
2922	q >>= 64 - expDiff;
2923	bSig >>= 2;
2924	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2925	}
2926	else {
2927	aSig >>= 2;
2928	bSig >>= 2;
2929	}
2930	do {
2931	alternateASig = aSig;
2932	++q;
2933	aSig -= bSig;
2934	} while ( 0 <= (sbits64) aSig );
2935	sigMean = aSig + alternateASig;
2936	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2937	aSig = alternateASig;
2938	}
2939	zSign = ( (sbits64) aSig < 0 );
2940	if ( zSign ) aSig = - aSig;
2941	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2942
2943	}
2944
2945	/*----------------------------------------------------------------------------
2946	\| Returns the square root of the double-precision floating-point value `a'.
2947	\| The operation is performed according to the IEC/IEEE Standard for Binary
2948	\| Floating-Point Arithmetic.
2949	----------------------------------------------------------------------------/
2950
2951	float64 float64_sqrt( float64 a STATUS_PARAM )
2952	{
2953	flag aSign;
2954	int16 aExp, zExp;
2955	bits64 aSig, zSig, doubleZSig;
2956	bits64 rem0, rem1, term0, term1;
2957
2958	aSig = extractFloat64Frac( a );
2959	aExp = extractFloat64Exp( a );
2960	aSign = extractFloat64Sign( a );
2961	if ( aExp == 0x7FF ) {
2962	if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2963	if ( ! aSign ) return a;
2964	float_raise( float_flag_invalid STATUS_VAR);
2965	return float64_default_nan;
2966	}
2967	if ( aSign ) {
2968	if ( ( aExp \| aSig ) == 0 ) return a;
2969	float_raise( float_flag_invalid STATUS_VAR);
2970	return float64_default_nan;
2971	}
2972	if ( aExp == 0 ) {
2973	if ( aSig == 0 ) return float64_zero;
2974	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975	}
2976	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2977	aSig \|= LIT64( 0x0010000000000000 );
2978	zSig = estimateSqrt32( aExp, aSig>>21 );
2979	aSig <<= 9 - ( aExp & 1 );
2980	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2981	if ( ( zSig & 0x1FF ) <= 5 ) {
2982	doubleZSig = zSig<<1;
2983	mul64To128( zSig, zSig, &term0, &term1 );
2984	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2985	while ( (sbits64) rem0 < 0 ) {
2986	--zSig;
2987	doubleZSig -= 2;
2988	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
2989	}
2990	zSig \|= ( ( rem0 \| rem1 ) != 0 );
2991	}
2992	return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2993
2994	}
2995
2996	/*----------------------------------------------------------------------------
2997	\| Returns 1 if the double-precision floating-point value `a' is equal to the
2998	\| corresponding value `b', and 0 otherwise. The comparison is performed
2999	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000	----------------------------------------------------------------------------/
3001
3002	int float64_eq( float64 a, float64 b STATUS_PARAM )
3003	{
3004	bits64 av, bv;
3005
3006	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3007	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3008	) {
3009	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3010	float_raise( float_flag_invalid STATUS_VAR);
3011	}
3012	return 0;
3013	}
3014	av = float64_val(a);
3015	bv = float64_val(b);
3016	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3017
3018	}
3019
3020	/*----------------------------------------------------------------------------
3021	\| Returns 1 if the double-precision floating-point value `a' is less than or
3022	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3023	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3024	\| Arithmetic.
3025	----------------------------------------------------------------------------/
3026
3027	int float64_le( float64 a, float64 b STATUS_PARAM )
3028	{
3029	flag aSign, bSign;
3030	bits64 av, bv;
3031
3032	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034	) {
3035	float_raise( float_flag_invalid STATUS_VAR);
3036	return 0;
3037	}
3038	aSign = extractFloat64Sign( a );
3039	bSign = extractFloat64Sign( b );
3040	av = float64_val(a);
3041	bv = float64_val(b);
3042	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3043	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3044
3045	}
3046
3047	/*----------------------------------------------------------------------------
3048	\| Returns 1 if the double-precision floating-point value `a' is less than
3049	\| the corresponding value `b', and 0 otherwise. The comparison is performed
3050	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051	----------------------------------------------------------------------------/
3052
3053	int float64_lt( float64 a, float64 b STATUS_PARAM )
3054	{
3055	flag aSign, bSign;
3056	bits64 av, bv;
3057
3058	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3060	) {
3061	float_raise( float_flag_invalid STATUS_VAR);
3062	return 0;
3063	}
3064	aSign = extractFloat64Sign( a );
3065	bSign = extractFloat64Sign( b );
3066	av = float64_val(a);
3067	bv = float64_val(b);
3068	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3069	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3070
3071	}
3072
3073	/*----------------------------------------------------------------------------
3074	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3075	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
3076	\| if either operand is a NaN. Otherwise, the comparison is performed
3077	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078	----------------------------------------------------------------------------/
3079
3080	int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3081	{
3082	bits64 av, bv;
3083
3084	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3085	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3086	) {
3087	float_raise( float_flag_invalid STATUS_VAR);
3088	return 0;
3089	}
3090	av = float64_val(a);
3091	bv = float64_val(b);
3092	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3093
3094	}
3095
3096	/*----------------------------------------------------------------------------
3097	\| Returns 1 if the double-precision floating-point value `a' is less than or
3098	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3099	\| cause an exception. Otherwise, the comparison is performed according to the
3100	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101	----------------------------------------------------------------------------/
3102
3103	int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3104	{
3105	flag aSign, bSign;
3106	bits64 av, bv;
3107
3108	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3109	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3110	) {
3111	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3112	float_raise( float_flag_invalid STATUS_VAR);
3113	}
3114	return 0;
3115	}
3116	aSign = extractFloat64Sign( a );
3117	bSign = extractFloat64Sign( b );
3118	av = float64_val(a);
3119	bv = float64_val(b);
3120	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3121	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3122
3123	}
3124
3125	/*----------------------------------------------------------------------------
3126	\| Returns 1 if the double-precision floating-point value `a' is less than
3127	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3128	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3129	\| Standard for Binary Floating-Point Arithmetic.
3130	----------------------------------------------------------------------------/
3131
3132	int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3133	{
3134	flag aSign, bSign;
3135	bits64 av, bv;
3136
3137	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139	) {
3140	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3141	float_raise( float_flag_invalid STATUS_VAR);
3142	}
3143	return 0;
3144	}
3145	aSign = extractFloat64Sign( a );
3146	bSign = extractFloat64Sign( b );
3147	av = float64_val(a);
3148	bv = float64_val(b);
3149	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3150	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3151
3152	}
3153
3154	#ifdef FLOATX80
3155
3156	/*----------------------------------------------------------------------------
3157	\| Returns the result of converting the extended double-precision floating-
3158	\| point value `a' to the 32-bit two's complement integer format. The
3159	\| conversion is performed according to the IEC/IEEE Standard for Binary
3160	\| Floating-Point Arithmetic---which means in particular that the conversion
3161	\| is rounded according to the current rounding mode. If `a' is a NaN, the
3162	\| largest positive integer is returned. Otherwise, if the conversion
3163	\| overflows, the largest integer with the same sign as `a' is returned.
3164	----------------------------------------------------------------------------/
3165
3166	int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3167	{
3168	flag aSign;
3169	int32 aExp, shiftCount;
3170	bits64 aSig;
3171
3172	aSig = extractFloatx80Frac( a );
3173	aExp = extractFloatx80Exp( a );
3174	aSign = extractFloatx80Sign( a );
3175	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3176	shiftCount = 0x4037 - aExp;
3177	if ( shiftCount <= 0 ) shiftCount = 1;
3178	shift64RightJamming( aSig, shiftCount, &aSig );
3179	return roundAndPackInt32( aSign, aSig STATUS_VAR );
3180
3181	}
3182
3183	/*----------------------------------------------------------------------------
3184	\| Returns the result of converting the extended double-precision floating-
3185	\| point value `a' to the 32-bit two's complement integer format. The
3186	\| conversion is performed according to the IEC/IEEE Standard for Binary
3187	\| Floating-Point Arithmetic, except that the conversion is always rounded
3188	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3189	\| Otherwise, if the conversion overflows, the largest integer with the same
3190	\| sign as `a' is returned.
3191	----------------------------------------------------------------------------/
3192
3193	int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3194	{
3195	flag aSign;
3196	int32 aExp, shiftCount;
3197	bits64 aSig, savedASig;
3198	int32 z;
3199
3200	aSig = extractFloatx80Frac( a );
3201	aExp = extractFloatx80Exp( a );
3202	aSign = extractFloatx80Sign( a );
3203	if ( 0x401E < aExp ) {
3204	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3205	goto invalid;
3206	}
3207	else if ( aExp < 0x3FFF ) {
3208	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3209	return 0;
3210	}
3211	shiftCount = 0x403E - aExp;
3212	savedASig = aSig;
3213	aSig >>= shiftCount;
3214	z = aSig;
3215	if ( aSign ) z = - z;
3216	if ( ( z < 0 ) ^ aSign ) {
3217	invalid:
3218	float_raise( float_flag_invalid STATUS_VAR);
3219	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3220	}
3221	if ( ( aSig<<shiftCount ) != savedASig ) {
3222	STATUS(float_exception_flags) \|= float_flag_inexact;
3223	}
3224	return z;
3225
3226	}
3227
3228	/*----------------------------------------------------------------------------
3229	\| Returns the result of converting the extended double-precision floating-
3230	\| point value `a' to the 64-bit two's complement integer format. The
3231	\| conversion is performed according to the IEC/IEEE Standard for Binary
3232	\| Floating-Point Arithmetic---which means in particular that the conversion
3233	\| is rounded according to the current rounding mode. If `a' is a NaN,
3234	\| the largest positive integer is returned. Otherwise, if the conversion
3235	\| overflows, the largest integer with the same sign as `a' is returned.
3236	----------------------------------------------------------------------------/
3237
3238	int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3239	{
3240	flag aSign;
3241	int32 aExp, shiftCount;
3242	bits64 aSig, aSigExtra;
3243
3244	aSig = extractFloatx80Frac( a );
3245	aExp = extractFloatx80Exp( a );
3246	aSign = extractFloatx80Sign( a );
3247	shiftCount = 0x403E - aExp;
3248	if ( shiftCount <= 0 ) {
3249	if ( shiftCount ) {
3250	float_raise( float_flag_invalid STATUS_VAR);
3251	if ( ! aSign
3252	\|\| ( ( aExp == 0x7FFF )
3253	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3254	) {
3255	return LIT64( 0x7FFFFFFFFFFFFFFF );
3256	}
3257	return (sbits64) LIT64( 0x8000000000000000 );
3258	}
3259	aSigExtra = 0;
3260	}
3261	else {
3262	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3263	}
3264	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3265
3266	}
3267
3268	/*----------------------------------------------------------------------------
3269	\| Returns the result of converting the extended double-precision floating-
3270	\| point value `a' to the 64-bit two's complement integer format. The
3271	\| conversion is performed according to the IEC/IEEE Standard for Binary
3272	\| Floating-Point Arithmetic, except that the conversion is always rounded
3273	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3274	\| Otherwise, if the conversion overflows, the largest integer with the same
3275	\| sign as `a' is returned.
3276	----------------------------------------------------------------------------/
3277
3278	int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3279	{
3280	flag aSign;
3281	int32 aExp, shiftCount;
3282	bits64 aSig;
3283	int64 z;
3284
3285	aSig = extractFloatx80Frac( a );
3286	aExp = extractFloatx80Exp( a );
3287	aSign = extractFloatx80Sign( a );
3288	shiftCount = aExp - 0x403E;
3289	if ( 0 <= shiftCount ) {
3290	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3291	if ( ( a.high != 0xC03E ) \|\| aSig ) {
3292	float_raise( float_flag_invalid STATUS_VAR);
3293	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
3294	return LIT64( 0x7FFFFFFFFFFFFFFF );
3295	}
3296	}
3297	return (sbits64) LIT64( 0x8000000000000000 );
3298	}
3299	else if ( aExp < 0x3FFF ) {
3300	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3301	return 0;
3302	}
3303	z = aSig>>( - shiftCount );
3304	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3305	STATUS(float_exception_flags) \|= float_flag_inexact;
3306	}
3307	if ( aSign ) z = - z;
3308	return z;
3309
3310	}
3311
3312	/*----------------------------------------------------------------------------
3313	\| Returns the result of converting the extended double-precision floating-
3314	\| point value `a' to the single-precision floating-point format. The
3315	\| conversion is performed according to the IEC/IEEE Standard for Binary
3316	\| Floating-Point Arithmetic.
3317	----------------------------------------------------------------------------/
3318
3319	float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3320	{
3321	flag aSign;
3322	int32 aExp;
3323	bits64 aSig;
3324
3325	aSig = extractFloatx80Frac( a );
3326	aExp = extractFloatx80Exp( a );
3327	aSign = extractFloatx80Sign( a );
3328	if ( aExp == 0x7FFF ) {
3329	if ( (bits64) ( aSig<<1 ) ) {
3330	return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3331	}
3332	return packFloat32( aSign, 0xFF, 0 );
3333	}
3334	shift64RightJamming( aSig, 33, &aSig );
3335	if ( aExp \|\| aSig ) aExp -= 0x3F81;
3336	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3337
3338	}
3339
3340	/*----------------------------------------------------------------------------
3341	\| Returns the result of converting the extended double-precision floating-
3342	\| point value `a' to the double-precision floating-point format. The
3343	\| conversion is performed according to the IEC/IEEE Standard for Binary
3344	\| Floating-Point Arithmetic.
3345	----------------------------------------------------------------------------/
3346
3347	float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3348	{
3349	flag aSign;
3350	int32 aExp;
3351	bits64 aSig, zSig;
3352
3353	aSig = extractFloatx80Frac( a );
3354	aExp = extractFloatx80Exp( a );
3355	aSign = extractFloatx80Sign( a );
3356	if ( aExp == 0x7FFF ) {
3357	if ( (bits64) ( aSig<<1 ) ) {
3358	return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3359	}
3360	return packFloat64( aSign, 0x7FF, 0 );
3361	}
3362	shift64RightJamming( aSig, 1, &zSig );
3363	if ( aExp \|\| aSig ) aExp -= 0x3C01;
3364	return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3365
3366	}
3367
3368	#ifdef FLOAT128
3369
3370	/*----------------------------------------------------------------------------
3371	\| Returns the result of converting the extended double-precision floating-
3372	\| point value `a' to the quadruple-precision floating-point format. The
3373	\| conversion is performed according to the IEC/IEEE Standard for Binary
3374	\| Floating-Point Arithmetic.
3375	----------------------------------------------------------------------------/
3376
3377	float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3378	{
3379	flag aSign;
3380	int16 aExp;
3381	bits64 aSig, zSig0, zSig1;
3382
3383	aSig = extractFloatx80Frac( a );
3384	aExp = extractFloatx80Exp( a );
3385	aSign = extractFloatx80Sign( a );
3386	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3387	return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3388	}
3389	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3390	return packFloat128( aSign, aExp, zSig0, zSig1 );
3391
3392	}
3393
3394	#endif
3395
3396	/*----------------------------------------------------------------------------
3397	\| Rounds the extended double-precision floating-point value `a' to an integer,
3398	\| and returns the result as an extended quadruple-precision floating-point
3399	\| value. The operation is performed according to the IEC/IEEE Standard for
3400	\| Binary Floating-Point Arithmetic.
3401	----------------------------------------------------------------------------/
3402
3403	floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3404	{
3405	flag aSign;
3406	int32 aExp;
3407	bits64 lastBitMask, roundBitsMask;
3408	int8 roundingMode;
3409	floatx80 z;
3410
3411	aExp = extractFloatx80Exp( a );
3412	if ( 0x403E <= aExp ) {
3413	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3414	return propagateFloatx80NaN( a, a STATUS_VAR );
3415	}
3416	return a;
3417	}
3418	if ( aExp < 0x3FFF ) {
3419	if ( ( aExp == 0 )
3420	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3421	return a;
3422	}
3423	STATUS(float_exception_flags) \|= float_flag_inexact;
3424	aSign = extractFloatx80Sign( a );
3425	switch ( STATUS(float_rounding_mode) ) {
3426	case float_round_nearest_even:
3427	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3428	) {
3429	return
3430	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3431	}
3432	break;
3433	case float_round_down:
3434	return
3435	aSign ?
3436	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437	: packFloatx80( 0, 0, 0 );
3438	case float_round_up:
3439	return
3440	aSign ? packFloatx80( 1, 0, 0 )
3441	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3442	}
3443	return packFloatx80( aSign, 0, 0 );
3444	}
3445	lastBitMask = 1;
3446	lastBitMask <<= 0x403E - aExp;
3447	roundBitsMask = lastBitMask - 1;
3448	z = a;
3449	roundingMode = STATUS(float_rounding_mode);
3450	if ( roundingMode == float_round_nearest_even ) {
3451	z.low += lastBitMask>>1;
3452	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3453	}
3454	else if ( roundingMode != float_round_to_zero ) {
3455	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3456	z.low += roundBitsMask;
3457	}
3458	}
3459	z.low &= ~ roundBitsMask;
3460	if ( z.low == 0 ) {
3461	++z.high;
3462	z.low = LIT64( 0x8000000000000000 );
3463	}
3464	if ( z.low != a.low ) STATUS(float_exception_flags) \|= float_flag_inexact;
3465	return z;
3466
3467	}
3468
3469	/*----------------------------------------------------------------------------
3470	\| Returns the result of adding the absolute values of the extended double-
3471	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3472	\| negated before being returned. `zSign' is ignored if the result is a NaN.
3473	\| The addition is performed according to the IEC/IEEE Standard for Binary
3474	\| Floating-Point Arithmetic.
3475	----------------------------------------------------------------------------/
3476
3477	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3478	{
3479	int32 aExp, bExp, zExp;
3480	bits64 aSig, bSig, zSig0, zSig1;
3481	int32 expDiff;
3482
3483	aSig = extractFloatx80Frac( a );
3484	aExp = extractFloatx80Exp( a );
3485	bSig = extractFloatx80Frac( b );
3486	bExp = extractFloatx80Exp( b );
3487	expDiff = aExp - bExp;
3488	if ( 0 < expDiff ) {
3489	if ( aExp == 0x7FFF ) {
3490	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3491	return a;
3492	}
3493	if ( bExp == 0 ) --expDiff;
3494	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3495	zExp = aExp;
3496	}
3497	else if ( expDiff < 0 ) {
3498	if ( bExp == 0x7FFF ) {
3499	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3500	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3501	}
3502	if ( aExp == 0 ) ++expDiff;
3503	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3504	zExp = bExp;
3505	}
3506	else {
3507	if ( aExp == 0x7FFF ) {
3508	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3509	return propagateFloatx80NaN( a, b STATUS_VAR );
3510	}
3511	return a;
3512	}
3513	zSig1 = 0;
3514	zSig0 = aSig + bSig;
3515	if ( aExp == 0 ) {
3516	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3517	goto roundAndPack;
3518	}
3519	zExp = aExp;
3520	goto shiftRight1;
3521	}
3522	zSig0 = aSig + bSig;
3523	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3524	shiftRight1:
3525	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3526	zSig0 \|= LIT64( 0x8000000000000000 );
3527	++zExp;
3528	roundAndPack:
3529	return
3530	roundAndPackFloatx80(
3531	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3532
3533	}
3534
3535	/*----------------------------------------------------------------------------
3536	\| Returns the result of subtracting the absolute values of the extended
3537	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3538	\| difference is negated before being returned. `zSign' is ignored if the
3539	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
3540	\| Standard for Binary Floating-Point Arithmetic.
3541	----------------------------------------------------------------------------/
3542
3543	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3544	{
3545	int32 aExp, bExp, zExp;
3546	bits64 aSig, bSig, zSig0, zSig1;
3547	int32 expDiff;
3548	floatx80 z;
3549
3550	aSig = extractFloatx80Frac( a );
3551	aExp = extractFloatx80Exp( a );
3552	bSig = extractFloatx80Frac( b );
3553	bExp = extractFloatx80Exp( b );
3554	expDiff = aExp - bExp;
3555	if ( 0 < expDiff ) goto aExpBigger;
3556	if ( expDiff < 0 ) goto bExpBigger;
3557	if ( aExp == 0x7FFF ) {
3558	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3559	return propagateFloatx80NaN( a, b STATUS_VAR );
3560	}
3561	float_raise( float_flag_invalid STATUS_VAR);
3562	z.low = floatx80_default_nan_low;
3563	z.high = floatx80_default_nan_high;
3564	return z;
3565	}
3566	if ( aExp == 0 ) {
3567	aExp = 1;
3568	bExp = 1;
3569	}
3570	zSig1 = 0;
3571	if ( bSig < aSig ) goto aBigger;
3572	if ( aSig < bSig ) goto bBigger;
3573	return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3574	bExpBigger:
3575	if ( bExp == 0x7FFF ) {
3576	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3577	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3578	}
3579	if ( aExp == 0 ) ++expDiff;
3580	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3581	bBigger:
3582	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3583	zExp = bExp;
3584	zSign ^= 1;
3585	goto normalizeRoundAndPack;
3586	aExpBigger:
3587	if ( aExp == 0x7FFF ) {
3588	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589	return a;
3590	}
3591	if ( bExp == 0 ) --expDiff;
3592	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3593	aBigger:
3594	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3595	zExp = aExp;
3596	normalizeRoundAndPack:
3597	return
3598	normalizeRoundAndPackFloatx80(
3599	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3600
3601	}
3602
3603	/*----------------------------------------------------------------------------
3604	\| Returns the result of adding the extended double-precision floating-point
3605	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
3606	\| Standard for Binary Floating-Point Arithmetic.
3607	----------------------------------------------------------------------------/
3608
3609	floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3610	{
3611	flag aSign, bSign;
3612
3613	aSign = extractFloatx80Sign( a );
3614	bSign = extractFloatx80Sign( b );
3615	if ( aSign == bSign ) {
3616	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3617	}
3618	else {
3619	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3620	}
3621
3622	}
3623
3624	/*----------------------------------------------------------------------------
3625	\| Returns the result of subtracting the extended double-precision floating-
3626	\| point values `a' and `b'. The operation is performed according to the
3627	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628	----------------------------------------------------------------------------/
3629
3630	floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3631	{
3632	flag aSign, bSign;
3633
3634	aSign = extractFloatx80Sign( a );
3635	bSign = extractFloatx80Sign( b );
3636	if ( aSign == bSign ) {
3637	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3638	}
3639	else {
3640	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3641	}
3642
3643	}
3644
3645	/*----------------------------------------------------------------------------
3646	\| Returns the result of multiplying the extended double-precision floating-
3647	\| point values `a' and `b'. The operation is performed according to the
3648	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649	----------------------------------------------------------------------------/
3650
3651	floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3652	{
3653	flag aSign, bSign, zSign;
3654	int32 aExp, bExp, zExp;
3655	bits64 aSig, bSig, zSig0, zSig1;
3656	floatx80 z;
3657
3658	aSig = extractFloatx80Frac( a );
3659	aExp = extractFloatx80Exp( a );
3660	aSign = extractFloatx80Sign( a );
3661	bSig = extractFloatx80Frac( b );
3662	bExp = extractFloatx80Exp( b );
3663	bSign = extractFloatx80Sign( b );
3664	zSign = aSign ^ bSign;
3665	if ( aExp == 0x7FFF ) {
3666	if ( (bits64) ( aSig<<1 )
3667	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3668	return propagateFloatx80NaN( a, b STATUS_VAR );
3669	}
3670	if ( ( bExp \| bSig ) == 0 ) goto invalid;
3671	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672	}
3673	if ( bExp == 0x7FFF ) {
3674	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3675	if ( ( aExp \| aSig ) == 0 ) {
3676	invalid:
3677	float_raise( float_flag_invalid STATUS_VAR);
3678	z.low = floatx80_default_nan_low;
3679	z.high = floatx80_default_nan_high;
3680	return z;
3681	}
3682	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683	}
3684	if ( aExp == 0 ) {
3685	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3686	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3687	}
3688	if ( bExp == 0 ) {
3689	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3690	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3691	}
3692	zExp = aExp + bExp - 0x3FFE;
3693	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3694	if ( 0 < (sbits64) zSig0 ) {
3695	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3696	--zExp;
3697	}
3698	return
3699	roundAndPackFloatx80(
3700	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3701
3702	}
3703
3704	/*----------------------------------------------------------------------------
3705	\| Returns the result of dividing the extended double-precision floating-point
3706	\| value `a' by the corresponding value `b'. The operation is performed
3707	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708	----------------------------------------------------------------------------/
3709
3710	floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3711	{
3712	flag aSign, bSign, zSign;
3713	int32 aExp, bExp, zExp;
3714	bits64 aSig, bSig, zSig0, zSig1;
3715	bits64 rem0, rem1, rem2, term0, term1, term2;
3716	floatx80 z;
3717
3718	aSig = extractFloatx80Frac( a );
3719	aExp = extractFloatx80Exp( a );
3720	aSign = extractFloatx80Sign( a );
3721	bSig = extractFloatx80Frac( b );
3722	bExp = extractFloatx80Exp( b );
3723	bSign = extractFloatx80Sign( b );
3724	zSign = aSign ^ bSign;
3725	if ( aExp == 0x7FFF ) {
3726	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3727	if ( bExp == 0x7FFF ) {
3728	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3729	goto invalid;
3730	}
3731	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3732	}
3733	if ( bExp == 0x7FFF ) {
3734	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3735	return packFloatx80( zSign, 0, 0 );
3736	}
3737	if ( bExp == 0 ) {
3738	if ( bSig == 0 ) {
3739	if ( ( aExp \| aSig ) == 0 ) {
3740	invalid:
3741	float_raise( float_flag_invalid STATUS_VAR);
3742	z.low = floatx80_default_nan_low;
3743	z.high = floatx80_default_nan_high;
3744	return z;
3745	}
3746	float_raise( float_flag_divbyzero STATUS_VAR);
3747	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3748	}
3749	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3750	}
3751	if ( aExp == 0 ) {
3752	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3753	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3754	}
3755	zExp = aExp - bExp + 0x3FFE;
3756	rem1 = 0;
3757	if ( bSig <= aSig ) {
3758	shift128Right( aSig, 0, 1, &aSig, &rem1 );
3759	++zExp;
3760	}
3761	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3762	mul64To128( bSig, zSig0, &term0, &term1 );
3763	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3764	while ( (sbits64) rem0 < 0 ) {
3765	--zSig0;
3766	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3767	}
3768	zSig1 = estimateDiv128To64( rem1, 0, bSig );
3769	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3770	mul64To128( bSig, zSig1, &term1, &term2 );
3771	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3772	while ( (sbits64) rem1 < 0 ) {
3773	--zSig1;
3774	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3775	}
3776	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
3777	}
3778	return
3779	roundAndPackFloatx80(
3780	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3781
3782	}
3783
3784	/*----------------------------------------------------------------------------
3785	\| Returns the remainder of the extended double-precision floating-point value
3786	\| `a' with respect to the corresponding value `b'. The operation is performed
3787	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788	----------------------------------------------------------------------------/
3789
3790	floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3791	{
3792	flag aSign, bSign, zSign;
3793	int32 aExp, bExp, expDiff;
3794	bits64 aSig0, aSig1, bSig;
3795	bits64 q, term0, term1, alternateASig0, alternateASig1;
3796	floatx80 z;
3797
3798	aSig0 = extractFloatx80Frac( a );
3799	aExp = extractFloatx80Exp( a );
3800	aSign = extractFloatx80Sign( a );
3801	bSig = extractFloatx80Frac( b );
3802	bExp = extractFloatx80Exp( b );
3803	bSign = extractFloatx80Sign( b );
3804	if ( aExp == 0x7FFF ) {
3805	if ( (bits64) ( aSig0<<1 )
3806	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807	return propagateFloatx80NaN( a, b STATUS_VAR );
3808	}
3809	goto invalid;
3810	}
3811	if ( bExp == 0x7FFF ) {
3812	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3813	return a;
3814	}
3815	if ( bExp == 0 ) {
3816	if ( bSig == 0 ) {
3817	invalid:
3818	float_raise( float_flag_invalid STATUS_VAR);
3819	z.low = floatx80_default_nan_low;
3820	z.high = floatx80_default_nan_high;
3821	return z;
3822	}
3823	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3824	}
3825	if ( aExp == 0 ) {
3826	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3827	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3828	}
3829	bSig \|= LIT64( 0x8000000000000000 );
3830	zSign = aSign;
3831	expDiff = aExp - bExp;
3832	aSig1 = 0;
3833	if ( expDiff < 0 ) {
3834	if ( expDiff < -1 ) return a;
3835	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3836	expDiff = 0;
3837	}
3838	q = ( bSig <= aSig0 );
3839	if ( q ) aSig0 -= bSig;
3840	expDiff -= 64;
3841	while ( 0 < expDiff ) {
3842	q = estimateDiv128To64( aSig0, aSig1, bSig );
3843	q = ( 2 < q ) ? q - 2 : 0;
3844	mul64To128( bSig, q, &term0, &term1 );
3845	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3846	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3847	expDiff -= 62;
3848	}
3849	expDiff += 64;
3850	if ( 0 < expDiff ) {
3851	q = estimateDiv128To64( aSig0, aSig1, bSig );
3852	q = ( 2 < q ) ? q - 2 : 0;
3853	q >>= 64 - expDiff;
3854	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3855	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3856	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3857	while ( le128( term0, term1, aSig0, aSig1 ) ) {
3858	++q;
3859	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3860	}
3861	}
3862	else {
3863	term1 = 0;
3864	term0 = bSig;
3865	}
3866	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3867	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3868	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3869	&& ( q & 1 ) )
3870	) {
3871	aSig0 = alternateASig0;
3872	aSig1 = alternateASig1;
3873	zSign = ! zSign;
3874	}
3875	return
3876	normalizeRoundAndPackFloatx80(
3877	80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3878
3879	}
3880
3881	/*----------------------------------------------------------------------------
3882	\| Returns the square root of the extended double-precision floating-point
3883	\| value `a'. The operation is performed according to the IEC/IEEE Standard
3884	\| for Binary Floating-Point Arithmetic.
3885	----------------------------------------------------------------------------/
3886
3887	floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3888	{
3889	flag aSign;
3890	int32 aExp, zExp;
3891	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3892	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3893	floatx80 z;
3894
3895	aSig0 = extractFloatx80Frac( a );
3896	aExp = extractFloatx80Exp( a );
3897	aSign = extractFloatx80Sign( a );
3898	if ( aExp == 0x7FFF ) {
3899	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3900	if ( ! aSign ) return a;
3901	goto invalid;
3902	}
3903	if ( aSign ) {
3904	if ( ( aExp \| aSig0 ) == 0 ) return a;
3905	invalid:
3906	float_raise( float_flag_invalid STATUS_VAR);
3907	z.low = floatx80_default_nan_low;
3908	z.high = floatx80_default_nan_high;
3909	return z;
3910	}
3911	if ( aExp == 0 ) {
3912	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3913	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3914	}
3915	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3916	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3917	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3918	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3919	doubleZSig0 = zSig0<<1;
3920	mul64To128( zSig0, zSig0, &term0, &term1 );
3921	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3922	while ( (sbits64) rem0 < 0 ) {
3923	--zSig0;
3924	doubleZSig0 -= 2;
3925	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
3926	}
3927	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3928	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929	if ( zSig1 == 0 ) zSig1 = 1;
3930	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3931	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3932	mul64To128( zSig1, zSig1, &term2, &term3 );
3933	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3934	while ( (sbits64) rem1 < 0 ) {
3935	--zSig1;
3936	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3937	term3 \|= 1;
3938	term2 \|= doubleZSig0;
3939	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3940	}
3941	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
3942	}
3943	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3944	zSig0 \|= doubleZSig0;
3945	return
3946	roundAndPackFloatx80(
3947	STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3948
3949	}
3950
3951	/*----------------------------------------------------------------------------
3952	\| Returns 1 if the extended double-precision floating-point value `a' is
3953	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3954	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3955	\| Arithmetic.
3956	----------------------------------------------------------------------------/
3957
3958	int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3959	{
3960
3961	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3962	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3963	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3964	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3965	) {
3966	if ( floatx80_is_signaling_nan( a )
3967	\|\| floatx80_is_signaling_nan( b ) ) {
3968	float_raise( float_flag_invalid STATUS_VAR);
3969	}
3970	return 0;
3971	}
3972	return
3973	( a.low == b.low )
3974	&& ( ( a.high == b.high )
3975	\|\| ( ( a.low == 0 )
3976	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
3977	);
3978
3979	}
3980
3981	/*----------------------------------------------------------------------------
3982	\| Returns 1 if the extended double-precision floating-point value `a' is
3983	\| less than or equal to the corresponding value `b', and 0 otherwise. The
3984	\| comparison is performed according to the IEC/IEEE Standard for Binary
3985	\| Floating-Point Arithmetic.
3986	----------------------------------------------------------------------------/
3987
3988	int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3989	{
3990	flag aSign, bSign;
3991
3992	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3993	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
3995	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996	) {
3997	float_raise( float_flag_invalid STATUS_VAR);
3998	return 0;
3999	}
4000	aSign = extractFloatx80Sign( a );
4001	bSign = extractFloatx80Sign( b );
4002	if ( aSign != bSign ) {
4003	return
4004	aSign
4005	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4006	== 0 );
4007	}
4008	return
4009	aSign ? le128( b.high, b.low, a.high, a.low )
4010	: le128( a.high, a.low, b.high, b.low );
4011
4012	}
4013
4014	/*----------------------------------------------------------------------------
4015	\| Returns 1 if the extended double-precision floating-point value `a' is
4016	\| less than the corresponding value `b', and 0 otherwise. The comparison
4017	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4018	\| Arithmetic.
4019	----------------------------------------------------------------------------/
4020
4021	int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4022	{
4023	flag aSign, bSign;
4024
4025	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4026	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4027	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4028	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4029	) {
4030	float_raise( float_flag_invalid STATUS_VAR);
4031	return 0;
4032	}
4033	aSign = extractFloatx80Sign( a );
4034	bSign = extractFloatx80Sign( b );
4035	if ( aSign != bSign ) {
4036	return
4037	aSign
4038	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4039	!= 0 );
4040	}
4041	return
4042	aSign ? lt128( b.high, b.low, a.high, a.low )
4043	: lt128( a.high, a.low, b.high, b.low );
4044
4045	}
4046
4047	/*----------------------------------------------------------------------------
4048	\| Returns 1 if the extended double-precision floating-point value `a' is equal
4049	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
4050	\| raised if either operand is a NaN. Otherwise, the comparison is performed
4051	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052	----------------------------------------------------------------------------/
4053
4054	int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4055	{
4056
4057	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4058	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4059	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4060	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4061	) {
4062	float_raise( float_flag_invalid STATUS_VAR);
4063	return 0;
4064	}
4065	return
4066	( a.low == b.low )
4067	&& ( ( a.high == b.high )
4068	\|\| ( ( a.low == 0 )
4069	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4070	);
4071
4072	}
4073
4074	/*----------------------------------------------------------------------------
4075	\| Returns 1 if the extended double-precision floating-point value `a' is less
4076	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4077	\| do not cause an exception. Otherwise, the comparison is performed according
4078	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079	----------------------------------------------------------------------------/
4080
4081	int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4082	{
4083	flag aSign, bSign;
4084
4085	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4086	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4087	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4088	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4089	) {
4090	if ( floatx80_is_signaling_nan( a )
4091	\|\| floatx80_is_signaling_nan( b ) ) {
4092	float_raise( float_flag_invalid STATUS_VAR);
4093	}
4094	return 0;
4095	}
4096	aSign = extractFloatx80Sign( a );
4097	bSign = extractFloatx80Sign( b );
4098	if ( aSign != bSign ) {
4099	return
4100	aSign
4101	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4102	== 0 );
4103	}
4104	return
4105	aSign ? le128( b.high, b.low, a.high, a.low )
4106	: le128( a.high, a.low, b.high, b.low );
4107
4108	}
4109
4110	/*----------------------------------------------------------------------------
4111	\| Returns 1 if the extended double-precision floating-point value `a' is less
4112	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4113	\| an exception. Otherwise, the comparison is performed according to the
4114	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115	----------------------------------------------------------------------------/
4116
4117	int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4118	{
4119	flag aSign, bSign;
4120
4121	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4122	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4123	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4124	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4125	) {
4126	if ( floatx80_is_signaling_nan( a )
4127	\|\| floatx80_is_signaling_nan( b ) ) {
4128	float_raise( float_flag_invalid STATUS_VAR);
4129	}
4130	return 0;
4131	}
4132	aSign = extractFloatx80Sign( a );
4133	bSign = extractFloatx80Sign( b );
4134	if ( aSign != bSign ) {
4135	return
4136	aSign
4137	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4138	!= 0 );
4139	}
4140	return
4141	aSign ? lt128( b.high, b.low, a.high, a.low )
4142	: lt128( a.high, a.low, b.high, b.low );
4143
4144	}
4145
4146	#endif
4147
4148	#ifdef FLOAT128
4149
4150	/*----------------------------------------------------------------------------
4151	\| Returns the result of converting the quadruple-precision floating-point
4152	\| value `a' to the 32-bit two's complement integer format. The conversion
4153	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154	\| Arithmetic---which means in particular that the conversion is rounded
4155	\| according to the current rounding mode. If `a' is a NaN, the largest
4156	\| positive integer is returned. Otherwise, if the conversion overflows, the
4157	\| largest integer with the same sign as `a' is returned.
4158	----------------------------------------------------------------------------/
4159
4160	int32 float128_to_int32( float128 a STATUS_PARAM )
4161	{
4162	flag aSign;
4163	int32 aExp, shiftCount;
4164	bits64 aSig0, aSig1;
4165
4166	aSig1 = extractFloat128Frac1( a );
4167	aSig0 = extractFloat128Frac0( a );
4168	aExp = extractFloat128Exp( a );
4169	aSign = extractFloat128Sign( a );
4170	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
4171	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4172	aSig0 \|= ( aSig1 != 0 );
4173	shiftCount = 0x4028 - aExp;
4174	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4175	return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4176
4177	}
4178
4179	/*----------------------------------------------------------------------------
4180	\| Returns the result of converting the quadruple-precision floating-point
4181	\| value `a' to the 32-bit two's complement integer format. The conversion
4182	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183	\| Arithmetic, except that the conversion is always rounded toward zero. If
4184	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4185	\| conversion overflows, the largest integer with the same sign as `a' is
4186	\| returned.
4187	----------------------------------------------------------------------------/
4188
4189	int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4190	{
4191	flag aSign;
4192	int32 aExp, shiftCount;
4193	bits64 aSig0, aSig1, savedASig;
4194	int32 z;
4195
4196	aSig1 = extractFloat128Frac1( a );
4197	aSig0 = extractFloat128Frac0( a );
4198	aExp = extractFloat128Exp( a );
4199	aSign = extractFloat128Sign( a );
4200	aSig0 \|= ( aSig1 != 0 );
4201	if ( 0x401E < aExp ) {
4202	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4203	goto invalid;
4204	}
4205	else if ( aExp < 0x3FFF ) {
4206	if ( aExp \|\| aSig0 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4207	return 0;
4208	}
4209	aSig0 \|= LIT64( 0x0001000000000000 );
4210	shiftCount = 0x402F - aExp;
4211	savedASig = aSig0;
4212	aSig0 >>= shiftCount;
4213	z = aSig0;
4214	if ( aSign ) z = - z;
4215	if ( ( z < 0 ) ^ aSign ) {
4216	invalid:
4217	float_raise( float_flag_invalid STATUS_VAR);
4218	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4219	}
4220	if ( ( aSig0<<shiftCount ) != savedASig ) {
4221	STATUS(float_exception_flags) \|= float_flag_inexact;
4222	}
4223	return z;
4224
4225	}
4226
4227	/*----------------------------------------------------------------------------
4228	\| Returns the result of converting the quadruple-precision floating-point
4229	\| value `a' to the 64-bit two's complement integer format. The conversion
4230	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231	\| Arithmetic---which means in particular that the conversion is rounded
4232	\| according to the current rounding mode. If `a' is a NaN, the largest
4233	\| positive integer is returned. Otherwise, if the conversion overflows, the
4234	\| largest integer with the same sign as `a' is returned.
4235	----------------------------------------------------------------------------/
4236
4237	int64 float128_to_int64( float128 a STATUS_PARAM )
4238	{
4239	flag aSign;
4240	int32 aExp, shiftCount;
4241	bits64 aSig0, aSig1;
4242
4243	aSig1 = extractFloat128Frac1( a );
4244	aSig0 = extractFloat128Frac0( a );
4245	aExp = extractFloat128Exp( a );
4246	aSign = extractFloat128Sign( a );
4247	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4248	shiftCount = 0x402F - aExp;
4249	if ( shiftCount <= 0 ) {
4250	if ( 0x403E < aExp ) {
4251	float_raise( float_flag_invalid STATUS_VAR);
4252	if ( ! aSign
4253	\|\| ( ( aExp == 0x7FFF )
4254	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4255	)
4256	) {
4257	return LIT64( 0x7FFFFFFFFFFFFFFF );
4258	}
4259	return (sbits64) LIT64( 0x8000000000000000 );
4260	}
4261	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4262	}
4263	else {
4264	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4265	}
4266	return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4267
4268	}
4269
4270	/*----------------------------------------------------------------------------
4271	\| Returns the result of converting the quadruple-precision floating-point
4272	\| value `a' to the 64-bit two's complement integer format. The conversion
4273	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274	\| Arithmetic, except that the conversion is always rounded toward zero.
4275	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4276	\| the conversion overflows, the largest integer with the same sign as `a' is
4277	\| returned.
4278	----------------------------------------------------------------------------/
4279
4280	int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4281	{
4282	flag aSign;
4283	int32 aExp, shiftCount;
4284	bits64 aSig0, aSig1;
4285	int64 z;
4286
4287	aSig1 = extractFloat128Frac1( a );
4288	aSig0 = extractFloat128Frac0( a );
4289	aExp = extractFloat128Exp( a );
4290	aSign = extractFloat128Sign( a );
4291	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4292	shiftCount = aExp - 0x402F;
4293	if ( 0 < shiftCount ) {
4294	if ( 0x403E <= aExp ) {
4295	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4296	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4297	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4298	if ( aSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4299	}
4300	else {
4301	float_raise( float_flag_invalid STATUS_VAR);
4302	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
4303	return LIT64( 0x7FFFFFFFFFFFFFFF );
4304	}
4305	}
4306	return (sbits64) LIT64( 0x8000000000000000 );
4307	}
4308	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
4309	if ( (bits64) ( aSig1<<shiftCount ) ) {
4310	STATUS(float_exception_flags) \|= float_flag_inexact;
4311	}
4312	}
4313	else {
4314	if ( aExp < 0x3FFF ) {
4315	if ( aExp \| aSig0 \| aSig1 ) {
4316	STATUS(float_exception_flags) \|= float_flag_inexact;
4317	}
4318	return 0;
4319	}
4320	z = aSig0>>( - shiftCount );
4321	if ( aSig1
4322	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4323	STATUS(float_exception_flags) \|= float_flag_inexact;
4324	}
4325	}
4326	if ( aSign ) z = - z;
4327	return z;
4328
4329	}
4330
4331	/*----------------------------------------------------------------------------
4332	\| Returns the result of converting the quadruple-precision floating-point
4333	\| value `a' to the single-precision floating-point format. The conversion
4334	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335	\| Arithmetic.
4336	----------------------------------------------------------------------------/
4337
4338	float32 float128_to_float32( float128 a STATUS_PARAM )
4339	{
4340	flag aSign;
4341	int32 aExp;
4342	bits64 aSig0, aSig1;
4343	bits32 zSig;
4344
4345	aSig1 = extractFloat128Frac1( a );
4346	aSig0 = extractFloat128Frac0( a );
4347	aExp = extractFloat128Exp( a );
4348	aSign = extractFloat128Sign( a );
4349	if ( aExp == 0x7FFF ) {
4350	if ( aSig0 \| aSig1 ) {
4351	return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4352	}
4353	return packFloat32( aSign, 0xFF, 0 );
4354	}
4355	aSig0 \|= ( aSig1 != 0 );
4356	shift64RightJamming( aSig0, 18, &aSig0 );
4357	zSig = aSig0;
4358	if ( aExp \|\| zSig ) {
4359	zSig \|= 0x40000000;
4360	aExp -= 0x3F81;
4361	}
4362	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4363
4364	}
4365
4366	/*----------------------------------------------------------------------------
4367	\| Returns the result of converting the quadruple-precision floating-point
4368	\| value `a' to the double-precision floating-point format. The conversion
4369	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4370	\| Arithmetic.
4371	----------------------------------------------------------------------------/
4372
4373	float64 float128_to_float64( float128 a STATUS_PARAM )
4374	{
4375	flag aSign;
4376	int32 aExp;
4377	bits64 aSig0, aSig1;
4378
4379	aSig1 = extractFloat128Frac1( a );
4380	aSig0 = extractFloat128Frac0( a );
4381	aExp = extractFloat128Exp( a );
4382	aSign = extractFloat128Sign( a );
4383	if ( aExp == 0x7FFF ) {
4384	if ( aSig0 \| aSig1 ) {
4385	return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4386	}
4387	return packFloat64( aSign, 0x7FF, 0 );
4388	}
4389	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4390	aSig0 \|= ( aSig1 != 0 );
4391	if ( aExp \|\| aSig0 ) {
4392	aSig0 \|= LIT64( 0x4000000000000000 );
4393	aExp -= 0x3C01;
4394	}
4395	return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4396
4397	}
4398
4399	#ifdef FLOATX80
4400
4401	/*----------------------------------------------------------------------------
4402	\| Returns the result of converting the quadruple-precision floating-point
4403	\| value `a' to the extended double-precision floating-point format. The
4404	\| conversion is performed according to the IEC/IEEE Standard for Binary
4405	\| Floating-Point Arithmetic.
4406	----------------------------------------------------------------------------/
4407
4408	floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4409	{
4410	flag aSign;
4411	int32 aExp;
4412	bits64 aSig0, aSig1;
4413
4414	aSig1 = extractFloat128Frac1( a );
4415	aSig0 = extractFloat128Frac0( a );
4416	aExp = extractFloat128Exp( a );
4417	aSign = extractFloat128Sign( a );
4418	if ( aExp == 0x7FFF ) {
4419	if ( aSig0 \| aSig1 ) {
4420	return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4421	}
4422	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4423	}
4424	if ( aExp == 0 ) {
4425	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4426	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4427	}
4428	else {
4429	aSig0 \|= LIT64( 0x0001000000000000 );
4430	}
4431	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4432	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4433
4434	}
4435
4436	#endif
4437
4438	/*----------------------------------------------------------------------------
4439	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
4440	\| returns the result as a quadruple-precision floating-point value. The
4441	\| operation is performed according to the IEC/IEEE Standard for Binary
4442	\| Floating-Point Arithmetic.
4443	----------------------------------------------------------------------------/
4444
4445	float128 float128_round_to_int( float128 a STATUS_PARAM )
4446	{
4447	flag aSign;
4448	int32 aExp;
4449	bits64 lastBitMask, roundBitsMask;
4450	int8 roundingMode;
4451	float128 z;
4452
4453	aExp = extractFloat128Exp( a );
4454	if ( 0x402F <= aExp ) {
4455	if ( 0x406F <= aExp ) {
4456	if ( ( aExp == 0x7FFF )
4457	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
4458	) {
4459	return propagateFloat128NaN( a, a STATUS_VAR );
4460	}
4461	return a;
4462	}
4463	lastBitMask = 1;
4464	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4465	roundBitsMask = lastBitMask - 1;
4466	z = a;
4467	roundingMode = STATUS(float_rounding_mode);
4468	if ( roundingMode == float_round_nearest_even ) {
4469	if ( lastBitMask ) {
4470	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4471	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4472	}
4473	else {
4474	if ( (sbits64) z.low < 0 ) {
4475	++z.high;
4476	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4477	}
4478	}
4479	}
4480	else if ( roundingMode != float_round_to_zero ) {
4481	if ( extractFloat128Sign( z )
4482	^ ( roundingMode == float_round_up ) ) {
4483	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4484	}
4485	}
4486	z.low &= ~ roundBitsMask;
4487	}
4488	else {
4489	if ( aExp < 0x3FFF ) {
4490	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
4491	STATUS(float_exception_flags) \|= float_flag_inexact;
4492	aSign = extractFloat128Sign( a );
4493	switch ( STATUS(float_rounding_mode) ) {
4494	case float_round_nearest_even:
4495	if ( ( aExp == 0x3FFE )
4496	&& ( extractFloat128Frac0( a )
4497	\| extractFloat128Frac1( a ) )
4498	) {
4499	return packFloat128( aSign, 0x3FFF, 0, 0 );
4500	}
4501	break;
4502	case float_round_down:
4503	return
4504	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4505	: packFloat128( 0, 0, 0, 0 );
4506	case float_round_up:
4507	return
4508	aSign ? packFloat128( 1, 0, 0, 0 )
4509	: packFloat128( 0, 0x3FFF, 0, 0 );
4510	}
4511	return packFloat128( aSign, 0, 0, 0 );
4512	}
4513	lastBitMask = 1;
4514	lastBitMask <<= 0x402F - aExp;
4515	roundBitsMask = lastBitMask - 1;
4516	z.low = 0;
4517	z.high = a.high;
4518	roundingMode = STATUS(float_rounding_mode);
4519	if ( roundingMode == float_round_nearest_even ) {
4520	z.high += lastBitMask>>1;
4521	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
4522	z.high &= ~ lastBitMask;
4523	}
4524	}
4525	else if ( roundingMode != float_round_to_zero ) {
4526	if ( extractFloat128Sign( z )
4527	^ ( roundingMode == float_round_up ) ) {
4528	z.high \|= ( a.low != 0 );
4529	z.high += roundBitsMask;
4530	}
4531	}
4532	z.high &= ~ roundBitsMask;
4533	}
4534	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
4535	STATUS(float_exception_flags) \|= float_flag_inexact;
4536	}
4537	return z;
4538
4539	}
4540
4541	/*----------------------------------------------------------------------------
4542	\| Returns the result of adding the absolute values of the quadruple-precision
4543	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4544	\| before being returned. `zSign' is ignored if the result is a NaN.
4545	\| The addition is performed according to the IEC/IEEE Standard for Binary
4546	\| Floating-Point Arithmetic.
4547	----------------------------------------------------------------------------/
4548
4549	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4550	{
4551	int32 aExp, bExp, zExp;
4552	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4553	int32 expDiff;
4554
4555	aSig1 = extractFloat128Frac1( a );
4556	aSig0 = extractFloat128Frac0( a );
4557	aExp = extractFloat128Exp( a );
4558	bSig1 = extractFloat128Frac1( b );
4559	bSig0 = extractFloat128Frac0( b );
4560	bExp = extractFloat128Exp( b );
4561	expDiff = aExp - bExp;
4562	if ( 0 < expDiff ) {
4563	if ( aExp == 0x7FFF ) {
4564	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4565	return a;
4566	}
4567	if ( bExp == 0 ) {
4568	--expDiff;
4569	}
4570	else {
4571	bSig0 \|= LIT64( 0x0001000000000000 );
4572	}
4573	shift128ExtraRightJamming(
4574	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4575	zExp = aExp;
4576	}
4577	else if ( expDiff < 0 ) {
4578	if ( bExp == 0x7FFF ) {
4579	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4580	return packFloat128( zSign, 0x7FFF, 0, 0 );
4581	}
4582	if ( aExp == 0 ) {
4583	++expDiff;
4584	}
4585	else {
4586	aSig0 \|= LIT64( 0x0001000000000000 );
4587	}
4588	shift128ExtraRightJamming(
4589	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4590	zExp = bExp;
4591	}
4592	else {
4593	if ( aExp == 0x7FFF ) {
4594	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4595	return propagateFloat128NaN( a, b STATUS_VAR );
4596	}
4597	return a;
4598	}
4599	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4600	if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4601	zSig2 = 0;
4602	zSig0 \|= LIT64( 0x0002000000000000 );
4603	zExp = aExp;
4604	goto shiftRight1;
4605	}
4606	aSig0 \|= LIT64( 0x0001000000000000 );
4607	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4608	--zExp;
4609	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4610	++zExp;
4611	shiftRight1:
4612	shift128ExtraRightJamming(
4613	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4614	roundAndPack:
4615	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4616
4617	}
4618
4619	/*----------------------------------------------------------------------------
4620	\| Returns the result of subtracting the absolute values of the quadruple-
4621	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
4622	\| difference is negated before being returned. `zSign' is ignored if the
4623	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
4624	\| Standard for Binary Floating-Point Arithmetic.
4625	----------------------------------------------------------------------------/
4626
4627	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4628	{
4629	int32 aExp, bExp, zExp;
4630	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4631	int32 expDiff;
4632	float128 z;
4633
4634	aSig1 = extractFloat128Frac1( a );
4635	aSig0 = extractFloat128Frac0( a );
4636	aExp = extractFloat128Exp( a );
4637	bSig1 = extractFloat128Frac1( b );
4638	bSig0 = extractFloat128Frac0( b );
4639	bExp = extractFloat128Exp( b );
4640	expDiff = aExp - bExp;
4641	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4642	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4643	if ( 0 < expDiff ) goto aExpBigger;
4644	if ( expDiff < 0 ) goto bExpBigger;
4645	if ( aExp == 0x7FFF ) {
4646	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4647	return propagateFloat128NaN( a, b STATUS_VAR );
4648	}
4649	float_raise( float_flag_invalid STATUS_VAR);
4650	z.low = float128_default_nan_low;
4651	z.high = float128_default_nan_high;
4652	return z;
4653	}
4654	if ( aExp == 0 ) {
4655	aExp = 1;
4656	bExp = 1;
4657	}
4658	if ( bSig0 < aSig0 ) goto aBigger;
4659	if ( aSig0 < bSig0 ) goto bBigger;
4660	if ( bSig1 < aSig1 ) goto aBigger;
4661	if ( aSig1 < bSig1 ) goto bBigger;
4662	return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4663	bExpBigger:
4664	if ( bExp == 0x7FFF ) {
4665	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4666	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4667	}
4668	if ( aExp == 0 ) {
4669	++expDiff;
4670	}
4671	else {
4672	aSig0 \|= LIT64( 0x4000000000000000 );
4673	}
4674	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4675	bSig0 \|= LIT64( 0x4000000000000000 );
4676	bBigger:
4677	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4678	zExp = bExp;
4679	zSign ^= 1;
4680	goto normalizeRoundAndPack;
4681	aExpBigger:
4682	if ( aExp == 0x7FFF ) {
4683	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4684	return a;
4685	}
4686	if ( bExp == 0 ) {
4687	--expDiff;
4688	}
4689	else {
4690	bSig0 \|= LIT64( 0x4000000000000000 );
4691	}
4692	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4693	aSig0 \|= LIT64( 0x4000000000000000 );
4694	aBigger:
4695	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4696	zExp = aExp;
4697	normalizeRoundAndPack:
4698	--zExp;
4699	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4700
4701	}
4702
4703	/*----------------------------------------------------------------------------
4704	\| Returns the result of adding the quadruple-precision floating-point values
4705	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4706	\| for Binary Floating-Point Arithmetic.
4707	----------------------------------------------------------------------------/
4708
4709	float128 float128_add( float128 a, float128 b STATUS_PARAM )
4710	{
4711	flag aSign, bSign;
4712
4713	aSign = extractFloat128Sign( a );
4714	bSign = extractFloat128Sign( b );
4715	if ( aSign == bSign ) {
4716	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4717	}
4718	else {
4719	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4720	}
4721
4722	}
4723
4724	/*----------------------------------------------------------------------------
4725	\| Returns the result of subtracting the quadruple-precision floating-point
4726	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4727	\| Standard for Binary Floating-Point Arithmetic.
4728	----------------------------------------------------------------------------/
4729
4730	float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4731	{
4732	flag aSign, bSign;
4733
4734	aSign = extractFloat128Sign( a );
4735	bSign = extractFloat128Sign( b );
4736	if ( aSign == bSign ) {
4737	return subFloat128Sigs( a, b, aSign STATUS_VAR );
4738	}
4739	else {
4740	return addFloat128Sigs( a, b, aSign STATUS_VAR );
4741	}
4742
4743	}
4744
4745	/*----------------------------------------------------------------------------
4746	\| Returns the result of multiplying the quadruple-precision floating-point
4747	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
4748	\| Standard for Binary Floating-Point Arithmetic.
4749	----------------------------------------------------------------------------/
4750
4751	float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4752	{
4753	flag aSign, bSign, zSign;
4754	int32 aExp, bExp, zExp;
4755	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4756	float128 z;
4757
4758	aSig1 = extractFloat128Frac1( a );
4759	aSig0 = extractFloat128Frac0( a );
4760	aExp = extractFloat128Exp( a );
4761	aSign = extractFloat128Sign( a );
4762	bSig1 = extractFloat128Frac1( b );
4763	bSig0 = extractFloat128Frac0( b );
4764	bExp = extractFloat128Exp( b );
4765	bSign = extractFloat128Sign( b );
4766	zSign = aSign ^ bSign;
4767	if ( aExp == 0x7FFF ) {
4768	if ( ( aSig0 \| aSig1 )
4769	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4770	return propagateFloat128NaN( a, b STATUS_VAR );
4771	}
4772	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid;
4773	return packFloat128( zSign, 0x7FFF, 0, 0 );
4774	}
4775	if ( bExp == 0x7FFF ) {
4776	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4778	invalid:
4779	float_raise( float_flag_invalid STATUS_VAR);
4780	z.low = float128_default_nan_low;
4781	z.high = float128_default_nan_high;
4782	return z;
4783	}
4784	return packFloat128( zSign, 0x7FFF, 0, 0 );
4785	}
4786	if ( aExp == 0 ) {
4787	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4788	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4789	}
4790	if ( bExp == 0 ) {
4791	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4792	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4793	}
4794	zExp = aExp + bExp - 0x4000;
4795	aSig0 \|= LIT64( 0x0001000000000000 );
4796	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4797	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4798	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4799	zSig2 \|= ( zSig3 != 0 );
4800	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4801	shift128ExtraRightJamming(
4802	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4803	++zExp;
4804	}
4805	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4806
4807	}
4808
4809	/*----------------------------------------------------------------------------
4810	\| Returns the result of dividing the quadruple-precision floating-point value
4811	\| `a' by the corresponding value `b'. The operation is performed according to
4812	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813	----------------------------------------------------------------------------/
4814
4815	float128 float128_div( float128 a, float128 b STATUS_PARAM )
4816	{
4817	flag aSign, bSign, zSign;
4818	int32 aExp, bExp, zExp;
4819	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4820	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4821	float128 z;
4822
4823	aSig1 = extractFloat128Frac1( a );
4824	aSig0 = extractFloat128Frac0( a );
4825	aExp = extractFloat128Exp( a );
4826	aSign = extractFloat128Sign( a );
4827	bSig1 = extractFloat128Frac1( b );
4828	bSig0 = extractFloat128Frac0( b );
4829	bExp = extractFloat128Exp( b );
4830	bSign = extractFloat128Sign( b );
4831	zSign = aSign ^ bSign;
4832	if ( aExp == 0x7FFF ) {
4833	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4834	if ( bExp == 0x7FFF ) {
4835	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4836	goto invalid;
4837	}
4838	return packFloat128( zSign, 0x7FFF, 0, 0 );
4839	}
4840	if ( bExp == 0x7FFF ) {
4841	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4842	return packFloat128( zSign, 0, 0, 0 );
4843	}
4844	if ( bExp == 0 ) {
4845	if ( ( bSig0 \| bSig1 ) == 0 ) {
4846	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
4847	invalid:
4848	float_raise( float_flag_invalid STATUS_VAR);
4849	z.low = float128_default_nan_low;
4850	z.high = float128_default_nan_high;
4851	return z;
4852	}
4853	float_raise( float_flag_divbyzero STATUS_VAR);
4854	return packFloat128( zSign, 0x7FFF, 0, 0 );
4855	}
4856	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857	}
4858	if ( aExp == 0 ) {
4859	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4861	}
4862	zExp = aExp - bExp + 0x3FFD;
4863	shortShift128Left(
4864	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4865	shortShift128Left(
4866	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4867	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4868	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4869	++zExp;
4870	}
4871	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4872	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4873	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4874	while ( (sbits64) rem0 < 0 ) {
4875	--zSig0;
4876	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4877	}
4878	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4879	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4880	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4881	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4882	while ( (sbits64) rem1 < 0 ) {
4883	--zSig1;
4884	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4885	}
4886	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4887	}
4888	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4889	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4890
4891	}
4892
4893	/*----------------------------------------------------------------------------
4894	\| Returns the remainder of the quadruple-precision floating-point value `a'
4895	\| with respect to the corresponding value `b'. The operation is performed
4896	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897	----------------------------------------------------------------------------/
4898
4899	float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4900	{
4901	flag aSign, bSign, zSign;
4902	int32 aExp, bExp, expDiff;
4903	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4904	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4905	sbits64 sigMean0;
4906	float128 z;
4907
4908	aSig1 = extractFloat128Frac1( a );
4909	aSig0 = extractFloat128Frac0( a );
4910	aExp = extractFloat128Exp( a );
4911	aSign = extractFloat128Sign( a );
4912	bSig1 = extractFloat128Frac1( b );
4913	bSig0 = extractFloat128Frac0( b );
4914	bExp = extractFloat128Exp( b );
4915	bSign = extractFloat128Sign( b );
4916	if ( aExp == 0x7FFF ) {
4917	if ( ( aSig0 \| aSig1 )
4918	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
4919	return propagateFloat128NaN( a, b STATUS_VAR );
4920	}
4921	goto invalid;
4922	}
4923	if ( bExp == 0x7FFF ) {
4924	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4925	return a;
4926	}
4927	if ( bExp == 0 ) {
4928	if ( ( bSig0 \| bSig1 ) == 0 ) {
4929	invalid:
4930	float_raise( float_flag_invalid STATUS_VAR);
4931	z.low = float128_default_nan_low;
4932	z.high = float128_default_nan_high;
4933	return z;
4934	}
4935	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4936	}
4937	if ( aExp == 0 ) {
4938	if ( ( aSig0 \| aSig1 ) == 0 ) return a;
4939	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4940	}
4941	expDiff = aExp - bExp;
4942	if ( expDiff < -1 ) return a;
4943	shortShift128Left(
4944	aSig0 \| LIT64( 0x0001000000000000 ),
4945	aSig1,
4946	15 - ( expDiff < 0 ),
4947	&aSig0,
4948	&aSig1
4949	);
4950	shortShift128Left(
4951	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4952	q = le128( bSig0, bSig1, aSig0, aSig1 );
4953	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4954	expDiff -= 64;
4955	while ( 0 < expDiff ) {
4956	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4957	q = ( 4 < q ) ? q - 4 : 0;
4958	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4959	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4960	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4961	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4962	expDiff -= 61;
4963	}
4964	if ( -64 < expDiff ) {
4965	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4966	q = ( 4 < q ) ? q - 4 : 0;
4967	q >>= - expDiff;
4968	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4969	expDiff += 52;
4970	if ( expDiff < 0 ) {
4971	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4972	}
4973	else {
4974	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4975	}
4976	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4977	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4978	}
4979	else {
4980	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4981	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4982	}
4983	do {
4984	alternateASig0 = aSig0;
4985	alternateASig1 = aSig1;
4986	++q;
4987	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4988	} while ( 0 <= (sbits64) aSig0 );
4989	add128(
4990	aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4991	if ( ( sigMean0 < 0 )
4992	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4993	aSig0 = alternateASig0;
4994	aSig1 = alternateASig1;
4995	}
4996	zSign = ( (sbits64) aSig0 < 0 );
4997	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4998	return
4999	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5000
5001	}
5002
5003	/*----------------------------------------------------------------------------
5004	\| Returns the square root of the quadruple-precision floating-point value `a'.
5005	\| The operation is performed according to the IEC/IEEE Standard for Binary
5006	\| Floating-Point Arithmetic.
5007	----------------------------------------------------------------------------/
5008
5009	float128 float128_sqrt( float128 a STATUS_PARAM )
5010	{
5011	flag aSign;
5012	int32 aExp, zExp;
5013	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5014	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5015	float128 z;
5016
5017	aSig1 = extractFloat128Frac1( a );
5018	aSig0 = extractFloat128Frac0( a );
5019	aExp = extractFloat128Exp( a );
5020	aSign = extractFloat128Sign( a );
5021	if ( aExp == 0x7FFF ) {
5022	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5023	if ( ! aSign ) return a;
5024	goto invalid;
5025	}
5026	if ( aSign ) {
5027	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a;
5028	invalid:
5029	float_raise( float_flag_invalid STATUS_VAR);
5030	z.low = float128_default_nan_low;
5031	z.high = float128_default_nan_high;
5032	return z;
5033	}
5034	if ( aExp == 0 ) {
5035	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5037	}
5038	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5039	aSig0 \|= LIT64( 0x0001000000000000 );
5040	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5041	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5042	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5043	doubleZSig0 = zSig0<<1;
5044	mul64To128( zSig0, zSig0, &term0, &term1 );
5045	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5046	while ( (sbits64) rem0 < 0 ) {
5047	--zSig0;
5048	doubleZSig0 -= 2;
5049	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
5050	}
5051	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5052	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5053	if ( zSig1 == 0 ) zSig1 = 1;
5054	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5055	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5056	mul64To128( zSig1, zSig1, &term2, &term3 );
5057	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5058	while ( (sbits64) rem1 < 0 ) {
5059	--zSig1;
5060	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5061	term3 \|= 1;
5062	term2 \|= doubleZSig0;
5063	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5064	}
5065	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
5066	}
5067	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5068	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5069
5070	}
5071
5072	/*----------------------------------------------------------------------------
5073	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5075	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076	----------------------------------------------------------------------------/
5077
5078	int float128_eq( float128 a, float128 b STATUS_PARAM )
5079	{
5080
5081	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5082	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5083	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5084	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5085	) {
5086	if ( float128_is_signaling_nan( a )
5087	\|\| float128_is_signaling_nan( b ) ) {
5088	float_raise( float_flag_invalid STATUS_VAR);
5089	}
5090	return 0;
5091	}
5092	return
5093	( a.low == b.low )
5094	&& ( ( a.high == b.high )
5095	\|\| ( ( a.low == 0 )
5096	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5097	);
5098
5099	}
5100
5101	/*----------------------------------------------------------------------------
5102	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5103	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
5104	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5105	\| Arithmetic.
5106	----------------------------------------------------------------------------/
5107
5108	int float128_le( float128 a, float128 b STATUS_PARAM )
5109	{
5110	flag aSign, bSign;
5111
5112	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5113	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5114	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5115	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5116	) {
5117	float_raise( float_flag_invalid STATUS_VAR);
5118	return 0;
5119	}
5120	aSign = extractFloat128Sign( a );
5121	bSign = extractFloat128Sign( b );
5122	if ( aSign != bSign ) {
5123	return
5124	aSign
5125	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5126	== 0 );
5127	}
5128	return
5129	aSign ? le128( b.high, b.low, a.high, a.low )
5130	: le128( a.high, a.low, b.high, b.low );
5131
5132	}
5133
5134	/*----------------------------------------------------------------------------
5135	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5136	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5137	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138	----------------------------------------------------------------------------/
5139
5140	int float128_lt( float128 a, float128 b STATUS_PARAM )
5141	{
5142	flag aSign, bSign;
5143
5144	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5145	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5146	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5147	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5148	) {
5149	float_raise( float_flag_invalid STATUS_VAR);
5150	return 0;
5151	}
5152	aSign = extractFloat128Sign( a );
5153	bSign = extractFloat128Sign( b );
5154	if ( aSign != bSign ) {
5155	return
5156	aSign
5157	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5158	!= 0 );
5159	}
5160	return
5161	aSign ? lt128( b.high, b.low, a.high, a.low )
5162	: lt128( a.high, a.low, b.high, b.low );
5163
5164	}
5165
5166	/*----------------------------------------------------------------------------
5167	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168	\| the corresponding value `b', and 0 otherwise. The invalid exception is
5169	\| raised if either operand is a NaN. Otherwise, the comparison is performed
5170	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171	----------------------------------------------------------------------------/
5172
5173	int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5174	{
5175
5176	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5177	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5178	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5179	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5180	) {
5181	float_raise( float_flag_invalid STATUS_VAR);
5182	return 0;
5183	}
5184	return
5185	( a.low == b.low )
5186	&& ( ( a.high == b.high )
5187	\|\| ( ( a.low == 0 )
5188	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5189	);
5190
5191	}
5192
5193	/*----------------------------------------------------------------------------
5194	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5195	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5196	\| cause an exception. Otherwise, the comparison is performed according to the
5197	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198	----------------------------------------------------------------------------/
5199
5200	int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5201	{
5202	flag aSign, bSign;
5203
5204	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5205	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5206	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5207	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5208	) {
5209	if ( float128_is_signaling_nan( a )
5210	\|\| float128_is_signaling_nan( b ) ) {
5211	float_raise( float_flag_invalid STATUS_VAR);
5212	}
5213	return 0;
5214	}
5215	aSign = extractFloat128Sign( a );
5216	bSign = extractFloat128Sign( b );
5217	if ( aSign != bSign ) {
5218	return
5219	aSign
5220	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5221	== 0 );
5222	}
5223	return
5224	aSign ? le128( b.high, b.low, a.high, a.low )
5225	: le128( a.high, a.low, b.high, b.low );
5226
5227	}
5228
5229	/*----------------------------------------------------------------------------
5230	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5231	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5232	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5233	\| Standard for Binary Floating-Point Arithmetic.
5234	----------------------------------------------------------------------------/
5235
5236	int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5237	{
5238	flag aSign, bSign;
5239
5240	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5241	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5242	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5243	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5244	) {
5245	if ( float128_is_signaling_nan( a )
5246	\|\| float128_is_signaling_nan( b ) ) {
5247	float_raise( float_flag_invalid STATUS_VAR);
5248	}
5249	return 0;
5250	}
5251	aSign = extractFloat128Sign( a );
5252	bSign = extractFloat128Sign( b );
5253	if ( aSign != bSign ) {
5254	return
5255	aSign
5256	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5257	!= 0 );
5258	}
5259	return
5260	aSign ? lt128( b.high, b.low, a.high, a.low )
5261	: lt128( a.high, a.low, b.high, b.low );
5262
5263	}
5264
5265	#endif
5266
5267	/* misc functions */
5268	float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5269	{
5270	return int64_to_float32(a STATUS_VAR);
5271	}
5272
5273	float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5274	{
5275	return int64_to_float64(a STATUS_VAR);
5276	}
5277
5278	unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5279	{
5280	int64_t v;
5281	unsigned int res;
5282
5283	v = float32_to_int64(a STATUS_VAR);
5284	if (v < 0) {
5285	res = 0;
5286	float_raise( float_flag_invalid STATUS_VAR);
5287	} else if (v > 0xffffffff) {
5288	res = 0xffffffff;
5289	float_raise( float_flag_invalid STATUS_VAR);
5290	} else {
5291	res = v;
5292	}
5293	return res;
5294	}
5295
5296	unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5297	{
5298	int64_t v;
5299	unsigned int res;
5300
5301	v = float32_to_int64_round_to_zero(a STATUS_VAR);
5302	if (v < 0) {
5303	res = 0;
5304	float_raise( float_flag_invalid STATUS_VAR);
5305	} else if (v > 0xffffffff) {
5306	res = 0xffffffff;
5307	float_raise( float_flag_invalid STATUS_VAR);
5308	} else {
5309	res = v;
5310	}
5311	return res;
5312	}
5313
5314	unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5315	{
5316	int64_t v;
5317	unsigned int res;
5318
5319	v = float64_to_int64(a STATUS_VAR);
5320	if (v < 0) {
5321	res = 0;
5322	float_raise( float_flag_invalid STATUS_VAR);
5323	} else if (v > 0xffffffff) {
5324	res = 0xffffffff;
5325	float_raise( float_flag_invalid STATUS_VAR);
5326	} else {
5327	res = v;
5328	}
5329	return res;
5330	}
5331
5332	unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5333	{
5334	int64_t v;
5335	unsigned int res;
5336
5337	v = float64_to_int64_round_to_zero(a STATUS_VAR);
5338	if (v < 0) {
5339	res = 0;
5340	float_raise( float_flag_invalid STATUS_VAR);
5341	} else if (v > 0xffffffff) {
5342	res = 0xffffffff;
5343	float_raise( float_flag_invalid STATUS_VAR);
5344	} else {
5345	res = v;
5346	}
5347	return res;
5348	}
5349
5350	/* FIXME: This looks broken. */
5351	uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5352	{
5353	int64_t v;
5354
5355	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5356	v += float64_val(a);
5357	v = float64_to_int64(make_float64(v) STATUS_VAR);
5358
5359	return v - INT64_MIN;
5360	}
5361
5362	uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5363	{
5364	int64_t v;
5365
5366	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5367	v += float64_val(a);
5368	v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5369
5370	return v - INT64_MIN;
5371	}
5372
5373	#define COMPARE(s, nan_exp) \
5374	INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5375	int is_quiet STATUS_PARAM ) \
5376	{ \
5377	flag aSign, bSign; \
5378	bits ## s av, bv; \
5379	\
5380	if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5381	extractFloat ## s ## Frac( a ) ) \|\| \
5382	( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5383	extractFloat ## s ## Frac( b ) )) { \
5384	if (!is_quiet \|\| \
5385	float ## s ## _is_signaling_nan( a ) \|\| \
5386	float ## s ## _is_signaling_nan( b ) ) { \
5387	float_raise( float_flag_invalid STATUS_VAR); \
5388	} \
5389	return float_relation_unordered; \
5390	} \
5391	aSign = extractFloat ## s ## Sign( a ); \
5392	bSign = extractFloat ## s ## Sign( b ); \
5393	av = float ## s ## _val(a); \
5394	bv = float ## s ## _val(b); \
5395	if ( aSign != bSign ) { \
5396	if ( (bits ## s) ( ( av \| bv )<<1 ) == 0 ) { \
5397	/* zero case */ \
5398	return float_relation_equal; \
5399	} else { \
5400	return 1 - (2 * aSign); \
5401	} \
5402	} else { \
5403	if (av == bv) { \
5404	return float_relation_equal; \
5405	} else { \
5406	return 1 - 2 * (aSign ^ ( av < bv )); \
5407	} \
5408	} \
5409	} \
5410	\
5411	int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5412	{ \
5413	return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5414	} \
5415	\
5416	int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5417	{ \
5418	return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5419	}
5420
5421	COMPARE(32, 0xff)
5422	COMPARE(64, 0x7ff)
5423
5424	INLINE int float128_compare_internal( float128 a, float128 b,
5425	int is_quiet STATUS_PARAM )
5426	{
5427	flag aSign, bSign;
5428
5429	if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5430	( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) ) \|\|
5431	( ( extractFloat128Exp( b ) == 0x7fff ) &&
5432	( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )) {
5433	if (!is_quiet \|\|
5434	float128_is_signaling_nan( a ) \|\|
5435	float128_is_signaling_nan( b ) ) {
5436	float_raise( float_flag_invalid STATUS_VAR);
5437	}
5438	return float_relation_unordered;
5439	}
5440	aSign = extractFloat128Sign( a );
5441	bSign = extractFloat128Sign( b );
5442	if ( aSign != bSign ) {
5443	if ( ( ( ( a.high \| b.high )<<1 ) \| a.low \| b.low ) == 0 ) {
5444	/* zero case */
5445	return float_relation_equal;
5446	} else {
5447	return 1 - (2 * aSign);
5448	}
5449	} else {
5450	if (a.low == b.low && a.high == b.high) {
5451	return float_relation_equal;
5452	} else {
5453	return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5454	}
5455	}
5456	}
5457
5458	int float128_compare( float128 a, float128 b STATUS_PARAM )
5459	{
5460	return float128_compare_internal(a, b, 0 STATUS_VAR);
5461	}
5462
5463	int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5464	{
5465	return float128_compare_internal(a, b, 1 STATUS_VAR);
5466	}
5467
5468	/* Multiply A by 2 raised to the power N. */
5469	float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5470	{
5471	flag aSign;
5472	int16 aExp;
5473	bits32 aSig;
5474
5475	aSig = extractFloat32Frac( a );
5476	aExp = extractFloat32Exp( a );
5477	aSign = extractFloat32Sign( a );
5478
5479	if ( aExp == 0xFF ) {
5480	return a;
5481	}
5482	aExp += n;
5483	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5484	}
5485
5486	float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5487	{
5488	flag aSign;
5489	int16 aExp;
5490	bits64 aSig;
5491
5492	aSig = extractFloat64Frac( a );
5493	aExp = extractFloat64Exp( a );
5494	aSign = extractFloat64Sign( a );
5495
5496	if ( aExp == 0x7FF ) {
5497	return a;
5498	}
5499	aExp += n;
5500	return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5501	}
5502
5503	#ifdef FLOATX80
5504	floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5505	{
5506	flag aSign;
5507	int16 aExp;
5508	bits64 aSig;
5509
5510	aSig = extractFloatx80Frac( a );
5511	aExp = extractFloatx80Exp( a );
5512	aSign = extractFloatx80Sign( a );
5513
5514	if ( aExp == 0x7FF ) {
5515	return a;
5516	}
5517	aExp += n;
5518	return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5519	aSign, aExp, aSig, 0 STATUS_VAR );
5520	}
5521	#endif
5522
5523	#ifdef FLOAT128
5524	float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5525	{
5526	flag aSign;
5527	int32 aExp;
5528	bits64 aSig0, aSig1;
5529
5530	aSig1 = extractFloat128Frac1( a );
5531	aSig0 = extractFloat128Frac0( a );
5532	aExp = extractFloat128Exp( a );
5533	aSign = extractFloat128Sign( a );
5534	if ( aExp == 0x7FFF ) {
5535	return a;
5536	}
5537	aExp += n;
5538	return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5539
5540	}
5541	#endif

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/recompiler/fpu/softfloat.c@ 36141

以其他格式下載: