VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 95410

Last change on this file since 95410 was 95403, checked in by vboxsync, 3 years ago

VMM/IEM: vxorps, vxorpd, vpxor, xorps, xorpd and various related fixes. bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 268.4 KB
 
1/* $Id: IEMAllAImplC.cpp 95403 2022-06-27 23:38:38Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18
19/*********************************************************************************************************************************
20* Header Files *
21*********************************************************************************************************************************/
22#include "IEMInternal.h"
23#include <VBox/vmm/vmcc.h>
24#include <iprt/errcore.h>
25#include <iprt/x86.h>
26#include <iprt/uint128.h>
27#include <iprt/uint256.h>
28
29RT_C_DECLS_BEGIN
30#include <softfloat.h>
31RT_C_DECLS_END
32
33
34/*********************************************************************************************************************************
35* Defined Constants And Macros *
36*********************************************************************************************************************************/
37/** @def IEM_WITHOUT_ASSEMBLY
38 * Enables all the code in this file.
39 */
40#if !defined(IEM_WITHOUT_ASSEMBLY)
41# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
42# define IEM_WITHOUT_ASSEMBLY
43# endif
44#endif
45/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes. */
46#ifdef IEM_WITH_ASSEMBLY
47# undef IEM_WITHOUT_ASSEMBLY
48#endif
49
50/**
51 * Calculates the signed flag value given a result and its bit width.
52 *
53 * The signed flag (SF) is a duplication of the most significant bit in the
54 * result.
55 *
56 * @returns X86_EFL_SF or 0.
57 * @param a_uResult Unsigned result value.
58 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
59 */
60#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
61 ( (uint32_t)((a_uResult) >> ((a_cBitsWidth) - X86_EFL_SF_BIT - 1)) & X86_EFL_SF )
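/* Illustration (assuming X86_EFL_SF_BIT is 7, i.e. SF occupies bit 7 of EFLAGS):
   for an 8-bit result the shift count is 8 - 7 - 1 = 0, so bit 7 is used in place;
   for a 32-bit result the shift count is 24, moving the sign bit (bit 31) down
   into the SF position before masking with X86_EFL_SF. */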
62
63/**
64 * Calculates the zero flag value given a result.
65 *
66 * The zero flag (ZF) indicates whether the result is zero or not.
67 *
68 * @returns X86_EFL_ZF or 0.
69 * @param a_uResult Unsigned result value.
70 */
71#define X86_EFL_CALC_ZF(a_uResult) \
72 ( (uint32_t)((a_uResult) == 0) << X86_EFL_ZF_BIT )
73
74/**
75 * Extracts the OF flag from an OF calculation result.
76 *
77 * These are typically used by concatenating with a bit count. The problem is
78 * that 8-bit values need shifting in the other direction from the others.
79 */
80#define X86_EFL_GET_OF_8(a_uValue) (((uint32_t)(a_uValue) << (X86_EFL_OF_BIT - 8 + 1)) & X86_EFL_OF)
81#define X86_EFL_GET_OF_16(a_uValue) ((uint32_t)((a_uValue) >> (16 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
82#define X86_EFL_GET_OF_32(a_uValue) ((uint32_t)((a_uValue) >> (32 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
83#define X86_EFL_GET_OF_64(a_uValue) ((uint32_t)((a_uValue) >> (64 - X86_EFL_OF_BIT - 1)) & X86_EFL_OF)
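/* Illustration (assuming X86_EFL_OF_BIT is 11): the 8-bit OF input carries its
   interesting bit at position 7, below the OF position, so it is shifted left by
   11 - 8 + 1 = 4; the 16/32/64-bit variants carry it at bit 15/31/63, above the
   OF position, so they shift right instead (e.g. by 32 - 11 - 1 = 20). */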
84
85/**
86 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
87 *
88 * @returns Status bits.
89 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
90 * @param a_uResult Unsigned result value.
91 * @param a_uSrc The source value (for AF calc).
92 * @param a_uDst The original destination value (for AF calc).
93 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
94 * @param a_CfExpr Bool expression for the carry flag (CF).
95 * @param a_uSrcOf The a_uSrc value to use for overflow calculation.
96 */
97#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
98 do { \
99 uint32_t fEflTmp = *(a_pfEFlags); \
100 fEflTmp &= ~X86_EFL_STATUS_BITS; \
101 fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
102 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
103 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
104 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
105 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
106 \
107 /* Overflow during ADDition happens when both inputs have the same sign \
108 bit value and the result has a different sign bit value. \
109 \
110 Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), it \
111 follows that for SUBtraction the sign bit value must differ between \
112 the two inputs and the result's sign bit must differ from the first input's. \
113 Note! Must xor with sign bit to convert, not do (0 - a_uSrc). \
114 \
115 See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt */ \
116 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth( ( ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
117 & RT_BIT_64(a_cBitsWidth - 1)) \
118 & ((a_uResult) ^ (a_uDst)) ); \
119 *(a_pfEFlags) = fEflTmp; \
120 } while (0)
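/* Worked 8-bit example for the macro above: ADD 0x7f + 0x01 = 0x80. CF stays
   clear (no unsigned wrap: 0x80 >= 0x7f), AF is set (bit 4 of 0x80 ^ 0x01 ^ 0x7f),
   and the OF term is (~(0x7f ^ 0x01) & 0x80) & (0x80 ^ 0x7f) = 0x80: both inputs
   share the sign bit while the result differs, so OF is set. */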
121
122/**
123 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
124 *
125 * CF and OF are defined to be 0 by logical operations. AF on the other hand is
126 * undefined. We do not set AF, as that seems to make the most sense (which
127 * probably makes it the most wrong in real life).
128 *
129 * @returns Status bits.
130 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
131 * @param a_uResult Unsigned result value.
132 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
133 * @param a_fExtra Additional bits to set.
134 */
135#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
136 do { \
137 uint32_t fEflTmp = *(a_pfEFlags); \
138 fEflTmp &= ~X86_EFL_STATUS_BITS; \
139 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
140 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
141 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
142 fEflTmp |= (a_fExtra); \
143 *(a_pfEFlags) = fEflTmp; \
144 } while (0)
145
146
147/*********************************************************************************************************************************
148* Global Variables *
149*********************************************************************************************************************************/
150/**
151 * Parity calculation table.
152 *
153 * This is also used by iemAllAImpl.asm.
154 *
155 * The generator code:
156 * @code
157 * #include <stdio.h>
158 *
159 * int main()
160 * {
161 * unsigned b;
162 * for (b = 0; b < 256; b++)
163 * {
164 * int cOnes = ( b & 1)
165 * + ((b >> 1) & 1)
166 * + ((b >> 2) & 1)
167 * + ((b >> 3) & 1)
168 * + ((b >> 4) & 1)
169 * + ((b >> 5) & 1)
170 * + ((b >> 6) & 1)
171 * + ((b >> 7) & 1);
172 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
173 * b,
174 * (b >> 7) & 1,
175 * (b >> 6) & 1,
176 * (b >> 5) & 1,
177 * (b >> 4) & 1,
178 * (b >> 3) & 1,
179 * (b >> 2) & 1,
180 * (b >> 1) & 1,
181 * b & 1,
182 * cOnes & 1 ? "0" : "X86_EFL_PF");
183 * }
184 * return 0;
185 * }
186 * @endcode
187 */
188uint8_t const g_afParity[256] =
189{
190 /* 0000 = 00000000b */ X86_EFL_PF,
191 /* 0x01 = 00000001b */ 0,
192 /* 0x02 = 00000010b */ 0,
193 /* 0x03 = 00000011b */ X86_EFL_PF,
194 /* 0x04 = 00000100b */ 0,
195 /* 0x05 = 00000101b */ X86_EFL_PF,
196 /* 0x06 = 00000110b */ X86_EFL_PF,
197 /* 0x07 = 00000111b */ 0,
198 /* 0x08 = 00001000b */ 0,
199 /* 0x09 = 00001001b */ X86_EFL_PF,
200 /* 0x0a = 00001010b */ X86_EFL_PF,
201 /* 0x0b = 00001011b */ 0,
202 /* 0x0c = 00001100b */ X86_EFL_PF,
203 /* 0x0d = 00001101b */ 0,
204 /* 0x0e = 00001110b */ 0,
205 /* 0x0f = 00001111b */ X86_EFL_PF,
206 /* 0x10 = 00010000b */ 0,
207 /* 0x11 = 00010001b */ X86_EFL_PF,
208 /* 0x12 = 00010010b */ X86_EFL_PF,
209 /* 0x13 = 00010011b */ 0,
210 /* 0x14 = 00010100b */ X86_EFL_PF,
211 /* 0x15 = 00010101b */ 0,
212 /* 0x16 = 00010110b */ 0,
213 /* 0x17 = 00010111b */ X86_EFL_PF,
214 /* 0x18 = 00011000b */ X86_EFL_PF,
215 /* 0x19 = 00011001b */ 0,
216 /* 0x1a = 00011010b */ 0,
217 /* 0x1b = 00011011b */ X86_EFL_PF,
218 /* 0x1c = 00011100b */ 0,
219 /* 0x1d = 00011101b */ X86_EFL_PF,
220 /* 0x1e = 00011110b */ X86_EFL_PF,
221 /* 0x1f = 00011111b */ 0,
222 /* 0x20 = 00100000b */ 0,
223 /* 0x21 = 00100001b */ X86_EFL_PF,
224 /* 0x22 = 00100010b */ X86_EFL_PF,
225 /* 0x23 = 00100011b */ 0,
226 /* 0x24 = 00100100b */ X86_EFL_PF,
227 /* 0x25 = 00100101b */ 0,
228 /* 0x26 = 00100110b */ 0,
229 /* 0x27 = 00100111b */ X86_EFL_PF,
230 /* 0x28 = 00101000b */ X86_EFL_PF,
231 /* 0x29 = 00101001b */ 0,
232 /* 0x2a = 00101010b */ 0,
233 /* 0x2b = 00101011b */ X86_EFL_PF,
234 /* 0x2c = 00101100b */ 0,
235 /* 0x2d = 00101101b */ X86_EFL_PF,
236 /* 0x2e = 00101110b */ X86_EFL_PF,
237 /* 0x2f = 00101111b */ 0,
238 /* 0x30 = 00110000b */ X86_EFL_PF,
239 /* 0x31 = 00110001b */ 0,
240 /* 0x32 = 00110010b */ 0,
241 /* 0x33 = 00110011b */ X86_EFL_PF,
242 /* 0x34 = 00110100b */ 0,
243 /* 0x35 = 00110101b */ X86_EFL_PF,
244 /* 0x36 = 00110110b */ X86_EFL_PF,
245 /* 0x37 = 00110111b */ 0,
246 /* 0x38 = 00111000b */ 0,
247 /* 0x39 = 00111001b */ X86_EFL_PF,
248 /* 0x3a = 00111010b */ X86_EFL_PF,
249 /* 0x3b = 00111011b */ 0,
250 /* 0x3c = 00111100b */ X86_EFL_PF,
251 /* 0x3d = 00111101b */ 0,
252 /* 0x3e = 00111110b */ 0,
253 /* 0x3f = 00111111b */ X86_EFL_PF,
254 /* 0x40 = 01000000b */ 0,
255 /* 0x41 = 01000001b */ X86_EFL_PF,
256 /* 0x42 = 01000010b */ X86_EFL_PF,
257 /* 0x43 = 01000011b */ 0,
258 /* 0x44 = 01000100b */ X86_EFL_PF,
259 /* 0x45 = 01000101b */ 0,
260 /* 0x46 = 01000110b */ 0,
261 /* 0x47 = 01000111b */ X86_EFL_PF,
262 /* 0x48 = 01001000b */ X86_EFL_PF,
263 /* 0x49 = 01001001b */ 0,
264 /* 0x4a = 01001010b */ 0,
265 /* 0x4b = 01001011b */ X86_EFL_PF,
266 /* 0x4c = 01001100b */ 0,
267 /* 0x4d = 01001101b */ X86_EFL_PF,
268 /* 0x4e = 01001110b */ X86_EFL_PF,
269 /* 0x4f = 01001111b */ 0,
270 /* 0x50 = 01010000b */ X86_EFL_PF,
271 /* 0x51 = 01010001b */ 0,
272 /* 0x52 = 01010010b */ 0,
273 /* 0x53 = 01010011b */ X86_EFL_PF,
274 /* 0x54 = 01010100b */ 0,
275 /* 0x55 = 01010101b */ X86_EFL_PF,
276 /* 0x56 = 01010110b */ X86_EFL_PF,
277 /* 0x57 = 01010111b */ 0,
278 /* 0x58 = 01011000b */ 0,
279 /* 0x59 = 01011001b */ X86_EFL_PF,
280 /* 0x5a = 01011010b */ X86_EFL_PF,
281 /* 0x5b = 01011011b */ 0,
282 /* 0x5c = 01011100b */ X86_EFL_PF,
283 /* 0x5d = 01011101b */ 0,
284 /* 0x5e = 01011110b */ 0,
285 /* 0x5f = 01011111b */ X86_EFL_PF,
286 /* 0x60 = 01100000b */ X86_EFL_PF,
287 /* 0x61 = 01100001b */ 0,
288 /* 0x62 = 01100010b */ 0,
289 /* 0x63 = 01100011b */ X86_EFL_PF,
290 /* 0x64 = 01100100b */ 0,
291 /* 0x65 = 01100101b */ X86_EFL_PF,
292 /* 0x66 = 01100110b */ X86_EFL_PF,
293 /* 0x67 = 01100111b */ 0,
294 /* 0x68 = 01101000b */ 0,
295 /* 0x69 = 01101001b */ X86_EFL_PF,
296 /* 0x6a = 01101010b */ X86_EFL_PF,
297 /* 0x6b = 01101011b */ 0,
298 /* 0x6c = 01101100b */ X86_EFL_PF,
299 /* 0x6d = 01101101b */ 0,
300 /* 0x6e = 01101110b */ 0,
301 /* 0x6f = 01101111b */ X86_EFL_PF,
302 /* 0x70 = 01110000b */ 0,
303 /* 0x71 = 01110001b */ X86_EFL_PF,
304 /* 0x72 = 01110010b */ X86_EFL_PF,
305 /* 0x73 = 01110011b */ 0,
306 /* 0x74 = 01110100b */ X86_EFL_PF,
307 /* 0x75 = 01110101b */ 0,
308 /* 0x76 = 01110110b */ 0,
309 /* 0x77 = 01110111b */ X86_EFL_PF,
310 /* 0x78 = 01111000b */ X86_EFL_PF,
311 /* 0x79 = 01111001b */ 0,
312 /* 0x7a = 01111010b */ 0,
313 /* 0x7b = 01111011b */ X86_EFL_PF,
314 /* 0x7c = 01111100b */ 0,
315 /* 0x7d = 01111101b */ X86_EFL_PF,
316 /* 0x7e = 01111110b */ X86_EFL_PF,
317 /* 0x7f = 01111111b */ 0,
318 /* 0x80 = 10000000b */ 0,
319 /* 0x81 = 10000001b */ X86_EFL_PF,
320 /* 0x82 = 10000010b */ X86_EFL_PF,
321 /* 0x83 = 10000011b */ 0,
322 /* 0x84 = 10000100b */ X86_EFL_PF,
323 /* 0x85 = 10000101b */ 0,
324 /* 0x86 = 10000110b */ 0,
325 /* 0x87 = 10000111b */ X86_EFL_PF,
326 /* 0x88 = 10001000b */ X86_EFL_PF,
327 /* 0x89 = 10001001b */ 0,
328 /* 0x8a = 10001010b */ 0,
329 /* 0x8b = 10001011b */ X86_EFL_PF,
330 /* 0x8c = 10001100b */ 0,
331 /* 0x8d = 10001101b */ X86_EFL_PF,
332 /* 0x8e = 10001110b */ X86_EFL_PF,
333 /* 0x8f = 10001111b */ 0,
334 /* 0x90 = 10010000b */ X86_EFL_PF,
335 /* 0x91 = 10010001b */ 0,
336 /* 0x92 = 10010010b */ 0,
337 /* 0x93 = 10010011b */ X86_EFL_PF,
338 /* 0x94 = 10010100b */ 0,
339 /* 0x95 = 10010101b */ X86_EFL_PF,
340 /* 0x96 = 10010110b */ X86_EFL_PF,
341 /* 0x97 = 10010111b */ 0,
342 /* 0x98 = 10011000b */ 0,
343 /* 0x99 = 10011001b */ X86_EFL_PF,
344 /* 0x9a = 10011010b */ X86_EFL_PF,
345 /* 0x9b = 10011011b */ 0,
346 /* 0x9c = 10011100b */ X86_EFL_PF,
347 /* 0x9d = 10011101b */ 0,
348 /* 0x9e = 10011110b */ 0,
349 /* 0x9f = 10011111b */ X86_EFL_PF,
350 /* 0xa0 = 10100000b */ X86_EFL_PF,
351 /* 0xa1 = 10100001b */ 0,
352 /* 0xa2 = 10100010b */ 0,
353 /* 0xa3 = 10100011b */ X86_EFL_PF,
354 /* 0xa4 = 10100100b */ 0,
355 /* 0xa5 = 10100101b */ X86_EFL_PF,
356 /* 0xa6 = 10100110b */ X86_EFL_PF,
357 /* 0xa7 = 10100111b */ 0,
358 /* 0xa8 = 10101000b */ 0,
359 /* 0xa9 = 10101001b */ X86_EFL_PF,
360 /* 0xaa = 10101010b */ X86_EFL_PF,
361 /* 0xab = 10101011b */ 0,
362 /* 0xac = 10101100b */ X86_EFL_PF,
363 /* 0xad = 10101101b */ 0,
364 /* 0xae = 10101110b */ 0,
365 /* 0xaf = 10101111b */ X86_EFL_PF,
366 /* 0xb0 = 10110000b */ 0,
367 /* 0xb1 = 10110001b */ X86_EFL_PF,
368 /* 0xb2 = 10110010b */ X86_EFL_PF,
369 /* 0xb3 = 10110011b */ 0,
370 /* 0xb4 = 10110100b */ X86_EFL_PF,
371 /* 0xb5 = 10110101b */ 0,
372 /* 0xb6 = 10110110b */ 0,
373 /* 0xb7 = 10110111b */ X86_EFL_PF,
374 /* 0xb8 = 10111000b */ X86_EFL_PF,
375 /* 0xb9 = 10111001b */ 0,
376 /* 0xba = 10111010b */ 0,
377 /* 0xbb = 10111011b */ X86_EFL_PF,
378 /* 0xbc = 10111100b */ 0,
379 /* 0xbd = 10111101b */ X86_EFL_PF,
380 /* 0xbe = 10111110b */ X86_EFL_PF,
381 /* 0xbf = 10111111b */ 0,
382 /* 0xc0 = 11000000b */ X86_EFL_PF,
383 /* 0xc1 = 11000001b */ 0,
384 /* 0xc2 = 11000010b */ 0,
385 /* 0xc3 = 11000011b */ X86_EFL_PF,
386 /* 0xc4 = 11000100b */ 0,
387 /* 0xc5 = 11000101b */ X86_EFL_PF,
388 /* 0xc6 = 11000110b */ X86_EFL_PF,
389 /* 0xc7 = 11000111b */ 0,
390 /* 0xc8 = 11001000b */ 0,
391 /* 0xc9 = 11001001b */ X86_EFL_PF,
392 /* 0xca = 11001010b */ X86_EFL_PF,
393 /* 0xcb = 11001011b */ 0,
394 /* 0xcc = 11001100b */ X86_EFL_PF,
395 /* 0xcd = 11001101b */ 0,
396 /* 0xce = 11001110b */ 0,
397 /* 0xcf = 11001111b */ X86_EFL_PF,
398 /* 0xd0 = 11010000b */ 0,
399 /* 0xd1 = 11010001b */ X86_EFL_PF,
400 /* 0xd2 = 11010010b */ X86_EFL_PF,
401 /* 0xd3 = 11010011b */ 0,
402 /* 0xd4 = 11010100b */ X86_EFL_PF,
403 /* 0xd5 = 11010101b */ 0,
404 /* 0xd6 = 11010110b */ 0,
405 /* 0xd7 = 11010111b */ X86_EFL_PF,
406 /* 0xd8 = 11011000b */ X86_EFL_PF,
407 /* 0xd9 = 11011001b */ 0,
408 /* 0xda = 11011010b */ 0,
409 /* 0xdb = 11011011b */ X86_EFL_PF,
410 /* 0xdc = 11011100b */ 0,
411 /* 0xdd = 11011101b */ X86_EFL_PF,
412 /* 0xde = 11011110b */ X86_EFL_PF,
413 /* 0xdf = 11011111b */ 0,
414 /* 0xe0 = 11100000b */ 0,
415 /* 0xe1 = 11100001b */ X86_EFL_PF,
416 /* 0xe2 = 11100010b */ X86_EFL_PF,
417 /* 0xe3 = 11100011b */ 0,
418 /* 0xe4 = 11100100b */ X86_EFL_PF,
419 /* 0xe5 = 11100101b */ 0,
420 /* 0xe6 = 11100110b */ 0,
421 /* 0xe7 = 11100111b */ X86_EFL_PF,
422 /* 0xe8 = 11101000b */ X86_EFL_PF,
423 /* 0xe9 = 11101001b */ 0,
424 /* 0xea = 11101010b */ 0,
425 /* 0xeb = 11101011b */ X86_EFL_PF,
426 /* 0xec = 11101100b */ 0,
427 /* 0xed = 11101101b */ X86_EFL_PF,
428 /* 0xee = 11101110b */ X86_EFL_PF,
429 /* 0xef = 11101111b */ 0,
430 /* 0xf0 = 11110000b */ X86_EFL_PF,
431 /* 0xf1 = 11110001b */ 0,
432 /* 0xf2 = 11110010b */ 0,
433 /* 0xf3 = 11110011b */ X86_EFL_PF,
434 /* 0xf4 = 11110100b */ 0,
435 /* 0xf5 = 11110101b */ X86_EFL_PF,
436 /* 0xf6 = 11110110b */ X86_EFL_PF,
437 /* 0xf7 = 11110111b */ 0,
438 /* 0xf8 = 11111000b */ 0,
439 /* 0xf9 = 11111001b */ X86_EFL_PF,
440 /* 0xfa = 11111010b */ X86_EFL_PF,
441 /* 0xfb = 11111011b */ 0,
442 /* 0xfc = 11111100b */ X86_EFL_PF,
443 /* 0xfd = 11111101b */ 0,
444 /* 0xfe = 11111110b */ 0,
445 /* 0xff = 11111111b */ X86_EFL_PF,
446};
447
448/* for clang: */
449extern const RTFLOAT80U g_ar80Zero[];
450extern const RTFLOAT80U g_ar80One[];
451extern const RTFLOAT80U g_r80Indefinite;
452extern const RTFLOAT80U g_ar80Infinity[];
453extern const RTFLOAT128U g_r128Ln2;
454extern const RTUINT128U g_u128Ln2Mantissa;
455extern const RTUINT128U g_u128Ln2MantissaIntel;
456extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
457
458/** Zero values (indexed by fSign). */
459RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
460
461/** One values (indexed by fSign). */
462RTFLOAT80U const g_ar80One[] =
463{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
464
465/** Indefinite (negative). */
466RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
467
468/** Infinities (indexed by fSign). */
469RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
470
471#if 0
472/** 128-bit floating point constant: 2.0 */
473const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
474#endif
475
476
477/* The next section is generated by tools/IEMGenFpuConstants: */
478
479/** The ln2 constant as 128-bit floating point value.
480 * base-10: 6.93147180559945309417232121458176575e-1
481 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
482 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
483//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
484const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
485/** High precision ln2 value.
486 * base-10: 6.931471805599453094172321214581765680747e-1
487 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
488 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
489const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
490/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
491 * base-10: 6.931471805599453094151379470289064954613e-1
492 * base-16: b.17217f7d1cf79abc0000000000000000@-1
493 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
494const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
495
496/** Horner constants for f2xm1 */
497const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
498{
499 /* a0
500 * base-10: 1.00000000000000000000000000000000000e0
501 * base-16: 1.0000000000000000000000000000@0
502 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
503 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
504 /* a1
505 * base-10: 5.00000000000000000000000000000000000e-1
506 * base-16: 8.0000000000000000000000000000@-1
507 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
508 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
509 /* a2
510 * base-10: 1.66666666666666666666666666666666658e-1
511 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
512 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
513 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
514 /* a3
515 * base-10: 4.16666666666666666666666666666666646e-2
516 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
517 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
518 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
519 /* a4
520 * base-10: 8.33333333333333333333333333333333323e-3
521 * base-16: 2.2222222222222222222222222222@-2
522 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
523 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
524 /* a5
525 * base-10: 1.38888888888888888888888888888888874e-3
526 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
527 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
528 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
529 /* a6
530 * base-10: 1.98412698412698412698412698412698412e-4
531 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
532 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
533 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
534 /* a7
535 * base-10: 2.48015873015873015873015873015873015e-5
536 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
537 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
538 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
539 /* a8
540 * base-10: 2.75573192239858906525573192239858902e-6
541 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
542 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
543 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
544 /* a9
545 * base-10: 2.75573192239858906525573192239858865e-7
546 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
547 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
548 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
549 /* a10
550 * base-10: 2.50521083854417187750521083854417184e-8
551 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
552 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
553 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
554 /* a11
555 * base-10: 2.08767569878680989792100903212014296e-9
556 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
557 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
558 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
559 /* a12
560 * base-10: 1.60590438368216145993923771701549472e-10
561 * base-16: b.092309d43684be51c198e91d7b40@-9
562 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
563 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
564 /* a13
565 * base-10: 1.14707455977297247138516979786821043e-11
566 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
567 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
568 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
569 /* a14
570 * base-10: 7.64716373181981647590113198578806964e-13
571 * base-16: d.73f9f399dc0f88ec32b587746578@-11
572 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
573 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
574 /* a15
575 * base-10: 4.77947733238738529743820749111754352e-14
576 * base-16: d.73f9f399dc0f88ec32b587746578@-12
577 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
578 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
579 /* a16
580 * base-10: 2.81145725434552076319894558301031970e-15
581 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
582 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
583 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
584 /* a17
585 * base-10: 1.56192069685862264622163643500573321e-16
586 * base-16: b.413c31dcbecbbdd8024435161550@-14
587 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
588 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
589 /* a18
590 * base-10: 8.22063524662432971695598123687227980e-18
591 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
592 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
593 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
594 /* a19
595 * base-10: 4.11031762331216485847799061843614006e-19
596 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
597 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
598 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
599 /* a20
600 * base-10: 7.04351638180413298434020229233492164e-20
601 * base-16: 1.4c9ee35db1d1f3c946fdcd48fd88@-16
602 * base-2 : 1.0100110010011110111000110101110110110001110100011111001111001001010001101111110111001101010010001111110110001000e-64 */
603 RTFLOAT128U_INIT_C(0, 0x4c9ee35db1d1, 0xf3c946fdcd48fd88, 0x3fbf),
604 /* a21
605 * base-10: 5.81527769640186708776361513365257702e-20
606 * base-16: 1.129e64bff606a2b9c9fc624481cd@-16
607 * base-2 : 1.0001001010011110011001001011111111110110000001101010001010111001110010011111110001100010010001001000000111001101e-64 */
608 RTFLOAT128U_INIT_C(0, 0x129e64bff606, 0xa2b9c9fc624481cd, 0x3fbf),
609};
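/* For reference: a0 thru a19 above match 1/(n+1)!, i.e. the Maclaurin coefficients
   of (e^y - 1)/y, which is what one would expect when evaluating
   2^x - 1 = e^(x*ln2) - 1 via Horner's rule. The last two constants deviate from
   1/21! and 1/22!, presumably tuned to better match real hardware results. */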
610
611
612/*
613 * There are a few 64-bit-on-32-bit things we'd rather do in C. Actually, doing
614 * it all in C is probably safer for now; optimize what's necessary later, maybe.
615 */
616#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
617
618
619/*********************************************************************************************************************************
620* Binary Operations *
621*********************************************************************************************************************************/
622
623/*
624 * ADD
625 */
626
627IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
628{
629 uint64_t uDst = *puDst;
630 uint64_t uResult = uDst + uSrc;
631 *puDst = uResult;
632 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
633}
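/* Illustration: the carry expression 'uResult < uDst' relies on unsigned
   wrap-around; e.g. 8-bit 0xff + 0x01 gives 0x00, which is below the original
   destination, so CF is reported. */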
634
635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
636
637IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
638{
639 uint32_t uDst = *puDst;
640 uint32_t uResult = uDst + uSrc;
641 *puDst = uResult;
642 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
643}
644
645
646IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
647{
648 uint16_t uDst = *puDst;
649 uint16_t uResult = uDst + uSrc;
650 *puDst = uResult;
651 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
652}
653
654
655IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
656{
657 uint8_t uDst = *puDst;
658 uint8_t uResult = uDst + uSrc;
659 *puDst = uResult;
660 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
661}
662
663# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
664
665/*
666 * ADC
667 */
668
669IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
670{
671 if (!(*pfEFlags & X86_EFL_CF))
672 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
673 else
674 {
675 uint64_t uDst = *puDst;
676 uint64_t uResult = uDst + uSrc + 1;
677 *puDst = uResult;
678 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
679 }
680}
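/* Illustration: with a carry-in of 1 the CF expression becomes 'uResult <= uDst';
   e.g. 8-bit 0x00 + 0xff + 1 wraps to exactly 0x00 == uDst and must still set CF,
   hence '<=' rather than '<'. */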
681
682# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
683
684IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
685{
686 if (!(*pfEFlags & X86_EFL_CF))
687 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
688 else
689 {
690 uint32_t uDst = *puDst;
691 uint32_t uResult = uDst + uSrc + 1;
692 *puDst = uResult;
693 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
694 }
695}
696
697
698IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
699{
700 if (!(*pfEFlags & X86_EFL_CF))
701 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
702 else
703 {
704 uint16_t uDst = *puDst;
705 uint16_t uResult = uDst + uSrc + 1;
706 *puDst = uResult;
707 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
708 }
709}
710
711
712IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
713{
714 if (!(*pfEFlags & X86_EFL_CF))
715 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
716 else
717 {
718 uint8_t uDst = *puDst;
719 uint8_t uResult = uDst + uSrc + 1;
720 *puDst = uResult;
721 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
722 }
723}
724
725# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
726
727/*
728 * SUB
729 */
730
731IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
732{
733 uint64_t uDst = *puDst;
734 uint64_t uResult = uDst - uSrc;
735 *puDst = uResult;
736 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
737}
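/* Illustration: borrow (CF) is simply 'uDst < uSrc'. Passing the source with its
   sign bit flipped as a_uSrcOf lets the ADD-style overflow formula in the status
   macro reproduce the SUB rule (operand sign bits differ and the result's sign
   differs from the destination's); e.g. 8-bit 0x80 - 0x01 = 0x7f sets OF. */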
738
739# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
740
741IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
742{
743 uint32_t uDst = *puDst;
744 uint32_t uResult = uDst - uSrc;
745 *puDst = uResult;
746 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
747}
748
749
750IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
751{
752 uint16_t uDst = *puDst;
753 uint16_t uResult = uDst - uSrc;
754 *puDst = uResult;
755 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
756}
757
758
759IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
760{
761 uint8_t uDst = *puDst;
762 uint8_t uResult = uDst - uSrc;
763 *puDst = uResult;
764 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
765}
766
767# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
768
769/*
770 * SBB
771 */
772
773IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
774{
775 if (!(*pfEFlags & X86_EFL_CF))
776 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
777 else
778 {
779 uint64_t uDst = *puDst;
780 uint64_t uResult = uDst - uSrc - 1;
781 *puDst = uResult;
782 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
783 }
784}
785
786# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
787
788IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
789{
790 if (!(*pfEFlags & X86_EFL_CF))
791 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
792 else
793 {
794 uint32_t uDst = *puDst;
795 uint32_t uResult = uDst - uSrc - 1;
796 *puDst = uResult;
797 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
798 }
799}
800
801
802IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
803{
804 if (!(*pfEFlags & X86_EFL_CF))
805 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
806 else
807 {
808 uint16_t uDst = *puDst;
809 uint16_t uResult = uDst - uSrc - 1;
810 *puDst = uResult;
811 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
812 }
813}
814
815
816IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
817{
818 if (!(*pfEFlags & X86_EFL_CF))
819 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
820 else
821 {
822 uint8_t uDst = *puDst;
823 uint8_t uResult = uDst - uSrc - 1;
824 *puDst = uResult;
825 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
826 }
827}
828
829# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
830
831
832/*
833 * OR
834 */
835
836IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
837{
838 uint64_t uResult = *puDst | uSrc;
839 *puDst = uResult;
840 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
841}
842
843# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
844
845IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
846{
847 uint32_t uResult = *puDst | uSrc;
848 *puDst = uResult;
849 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
850}
851
852
853IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
854{
855 uint16_t uResult = *puDst | uSrc;
856 *puDst = uResult;
857 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
858}
859
860
861IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
862{
863 uint8_t uResult = *puDst | uSrc;
864 *puDst = uResult;
865 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
866}
867
868# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
869
870/*
871 * XOR
872 */
873
874IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
875{
876 uint64_t uResult = *puDst ^ uSrc;
877 *puDst = uResult;
878 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
879}
880
881# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
882
883IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
884{
885 uint32_t uResult = *puDst ^ uSrc;
886 *puDst = uResult;
887 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
888}
889
890
891IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
892{
893 uint16_t uResult = *puDst ^ uSrc;
894 *puDst = uResult;
895 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
896}
897
898
899IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
900{
901 uint8_t uResult = *puDst ^ uSrc;
902 *puDst = uResult;
903 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
904}
905
906# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
907
908/*
909 * AND
910 */
911
912IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
913{
914 uint64_t const uResult = *puDst & uSrc;
915 *puDst = uResult;
916 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
917}
918
919# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
920
921IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
922{
923 uint32_t const uResult = *puDst & uSrc;
924 *puDst = uResult;
925 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
926}
927
928
929IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
930{
931 uint16_t const uResult = *puDst & uSrc;
932 *puDst = uResult;
933 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
934}
935
936
937IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
938{
939 uint8_t const uResult = *puDst & uSrc;
940 *puDst = uResult;
941 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
942}
943
944# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
945#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
946
947/*
948 * ANDN (BMI1 instruction)
949 */
950
951IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
952{
953 uint64_t const uResult = ~uSrc1 & uSrc2;
954 *puDst = uResult;
955 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
956}
957
958
959IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
960{
961 uint32_t const uResult = ~uSrc1 & uSrc2;
962 *puDst = uResult;
963 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
964}
965
966
967#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
968IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
969{
970 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
971}
972#endif
973
974
975#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
976IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
977{
978 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
979}
980#endif
981
982#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
983
984/*
985 * CMP
986 */
987
988IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
989{
990 uint64_t uDstTmp = *puDst;
991 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
992}
993
994# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
995
996IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
997{
998 uint32_t uDstTmp = *puDst;
999 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1000}
1001
1002
1003IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1004{
1005 uint16_t uDstTmp = *puDst;
1006 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1007}
1008
1009
1010IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1011{
1012 uint8_t uDstTmp = *puDst;
1013 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1014}
1015
1016# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1017
1018/*
1019 * TEST
1020 */
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint64_t uResult = *puDst & uSrc;
1025 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1026}
1027
1028# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1029
1030IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1031{
1032 uint32_t uResult = *puDst & uSrc;
1033 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1034}
1035
1036
1037IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1038{
1039 uint16_t uResult = *puDst & uSrc;
1040 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1041}
1042
1043
1044IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1045{
1046 uint8_t uResult = *puDst & uSrc;
1047 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1048}
1049
1050# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1051
1052
1053/*
1054 * LOCK prefixed variants of the above
1055 */
1056
1057/** 64-bit locked binary operand operation. */
1058# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1059 do { \
1060 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1061 uint ## a_cBitsWidth ## _t uTmp; \
1062 uint32_t fEflTmp; \
1063 do \
1064 { \
1065 uTmp = uOld; \
1066 fEflTmp = *pfEFlags; \
1067 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
1068 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
1069 *pfEFlags = fEflTmp; \
1070 } while (0)
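/* Illustration of the macro above: it snapshots the destination, runs the plain
   (unlocked) C worker on local copies, and then tries to publish the result with
   a compare-and-exchange. ASMAtomicCmpXchgExU<N> stores the value it actually
   found back into uOld on failure, so the loop simply retries with fresh input
   until no other agent has modified *puDst in between. */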
1071
1072
1073#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1074 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1075 uint ## a_cBitsWidth ## _t uSrc, \
1076 uint32_t *pfEFlags)) \
1077 { \
1078 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1079 }
1080
1081EMIT_LOCKED_BIN_OP(add, 64)
1082EMIT_LOCKED_BIN_OP(adc, 64)
1083EMIT_LOCKED_BIN_OP(sub, 64)
1084EMIT_LOCKED_BIN_OP(sbb, 64)
1085EMIT_LOCKED_BIN_OP(or, 64)
1086EMIT_LOCKED_BIN_OP(xor, 64)
1087EMIT_LOCKED_BIN_OP(and, 64)
1088# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1089EMIT_LOCKED_BIN_OP(add, 32)
1090EMIT_LOCKED_BIN_OP(adc, 32)
1091EMIT_LOCKED_BIN_OP(sub, 32)
1092EMIT_LOCKED_BIN_OP(sbb, 32)
1093EMIT_LOCKED_BIN_OP(or, 32)
1094EMIT_LOCKED_BIN_OP(xor, 32)
1095EMIT_LOCKED_BIN_OP(and, 32)
1096
1097EMIT_LOCKED_BIN_OP(add, 16)
1098EMIT_LOCKED_BIN_OP(adc, 16)
1099EMIT_LOCKED_BIN_OP(sub, 16)
1100EMIT_LOCKED_BIN_OP(sbb, 16)
1101EMIT_LOCKED_BIN_OP(or, 16)
1102EMIT_LOCKED_BIN_OP(xor, 16)
1103EMIT_LOCKED_BIN_OP(and, 16)
1104
1105EMIT_LOCKED_BIN_OP(add, 8)
1106EMIT_LOCKED_BIN_OP(adc, 8)
1107EMIT_LOCKED_BIN_OP(sub, 8)
1108EMIT_LOCKED_BIN_OP(sbb, 8)
1109EMIT_LOCKED_BIN_OP(or, 8)
1110EMIT_LOCKED_BIN_OP(xor, 8)
1111EMIT_LOCKED_BIN_OP(and, 8)
1112# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1113
1114
1115/*
1116 * Bit operations (same signature as above).
1117 */
1118
1119/*
1120 * BT
1121 */
1122
1123IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1124{
1125 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1126 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1127 Assert(uSrc < 64);
1128 uint64_t uDst = *puDst;
1129 if (uDst & RT_BIT_64(uSrc))
1130 *pfEFlags |= X86_EFL_CF;
1131 else
1132 *pfEFlags &= ~X86_EFL_CF;
1133}
1134
1135# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1136
1137IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1138{
1139 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1140 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1141 Assert(uSrc < 32);
1142 uint32_t uDst = *puDst;
1143 if (uDst & RT_BIT_32(uSrc))
1144 *pfEFlags |= X86_EFL_CF;
1145 else
1146 *pfEFlags &= ~X86_EFL_CF;
1147}
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 16);
1154 uint16_t uDst = *puDst;
1155 if (uDst & RT_BIT_32(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1162
1163/*
1164 * BTC
1165 */
1166
1167IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1168{
1169 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1170 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1171 Assert(uSrc < 64);
1172 uint64_t fMask = RT_BIT_64(uSrc);
1173 uint64_t uDst = *puDst;
1174 if (uDst & fMask)
1175 {
1176 uDst &= ~fMask;
1177 *puDst = uDst;
1178 *pfEFlags |= X86_EFL_CF;
1179 }
1180 else
1181 {
1182 uDst |= fMask;
1183 *puDst = uDst;
1184 *pfEFlags &= ~X86_EFL_CF;
1185 }
1186}
1187
1188# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1189
1190IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1191{
1192 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1193 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1194 Assert(uSrc < 32);
1195 uint32_t fMask = RT_BIT_32(uSrc);
1196 uint32_t uDst = *puDst;
1197 if (uDst & fMask)
1198 {
1199 uDst &= ~fMask;
1200 *puDst = uDst;
1201 *pfEFlags |= X86_EFL_CF;
1202 }
1203 else
1204 {
1205 uDst |= fMask;
1206 *puDst = uDst;
1207 *pfEFlags &= ~X86_EFL_CF;
1208 }
1209}
1210
1211
1212IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1213{
1214 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1215 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1216 Assert(uSrc < 16);
1217 uint16_t fMask = RT_BIT_32(uSrc);
1218 uint16_t uDst = *puDst;
1219 if (uDst & fMask)
1220 {
1221 uDst &= ~fMask;
1222 *puDst = uDst;
1223 *pfEFlags |= X86_EFL_CF;
1224 }
1225 else
1226 {
1227 uDst |= fMask;
1228 *puDst = uDst;
1229 *pfEFlags &= ~X86_EFL_CF;
1230 }
1231}
1232
1233# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1234
1235/*
1236 * BTR
1237 */
1238
1239IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1240{
1241 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1242 logical operation (AND/OR/whatever). */
1243 Assert(uSrc < 64);
1244 uint64_t fMask = RT_BIT_64(uSrc);
1245 uint64_t uDst = *puDst;
1246 if (uDst & fMask)
1247 {
1248 uDst &= ~fMask;
1249 *puDst = uDst;
1250 *pfEFlags |= X86_EFL_CF;
1251 }
1252 else
1253 *pfEFlags &= ~X86_EFL_CF;
1254}
1255
1256# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1257
1258IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1259{
1260 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1261 logical operation (AND/OR/whatever). */
1262 Assert(uSrc < 32);
1263 uint32_t fMask = RT_BIT_32(uSrc);
1264 uint32_t uDst = *puDst;
1265 if (uDst & fMask)
1266 {
1267 uDst &= ~fMask;
1268 *puDst = uDst;
1269 *pfEFlags |= X86_EFL_CF;
1270 }
1271 else
1272 *pfEFlags &= ~X86_EFL_CF;
1273}
1274
1275
1276IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1277{
1278 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1279 logical operation (AND/OR/whatever). */
1280 Assert(uSrc < 16);
1281 uint16_t fMask = RT_BIT_32(uSrc);
1282 uint16_t uDst = *puDst;
1283 if (uDst & fMask)
1284 {
1285 uDst &= ~fMask;
1286 *puDst = uDst;
1287 *pfEFlags |= X86_EFL_CF;
1288 }
1289 else
1290 *pfEFlags &= ~X86_EFL_CF;
1291}
1292
1293# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1294
1295/*
1296 * BTS
1297 */
1298
1299IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1300{
1301 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1302 logical operation (AND/OR/whatever). */
1303 Assert(uSrc < 64);
1304 uint64_t fMask = RT_BIT_64(uSrc);
1305 uint64_t uDst = *puDst;
1306 if (uDst & fMask)
1307 *pfEFlags |= X86_EFL_CF;
1308 else
1309 {
1310 uDst |= fMask;
1311 *puDst = uDst;
1312 *pfEFlags &= ~X86_EFL_CF;
1313 }
1314}
1315
1316# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1317
1318IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1319{
1320 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1321 logical operation (AND/OR/whatever). */
1322 Assert(uSrc < 32);
1323 uint32_t fMask = RT_BIT_32(uSrc);
1324 uint32_t uDst = *puDst;
1325 if (uDst & fMask)
1326 *pfEFlags |= X86_EFL_CF;
1327 else
1328 {
1329 uDst |= fMask;
1330 *puDst = uDst;
1331 *pfEFlags &= ~X86_EFL_CF;
1332 }
1333}
1334
1335
1336IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1337{
1338 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after a
1339 logical operation (AND/OR/whatever). */
1340 Assert(uSrc < 16);
1341 uint16_t fMask = RT_BIT_32(uSrc);
1342 uint32_t uDst = *puDst;
1343 if (uDst & fMask)
1344 *pfEFlags |= X86_EFL_CF;
1345 else
1346 {
1347 uDst |= fMask;
1348 *puDst = uDst;
1349 *pfEFlags &= ~X86_EFL_CF;
1350 }
1351}
1352
1353# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1354
1355
1356EMIT_LOCKED_BIN_OP(btc, 64)
1357EMIT_LOCKED_BIN_OP(btr, 64)
1358EMIT_LOCKED_BIN_OP(bts, 64)
1359# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1360EMIT_LOCKED_BIN_OP(btc, 32)
1361EMIT_LOCKED_BIN_OP(btr, 32)
1362EMIT_LOCKED_BIN_OP(bts, 32)
1363
1364EMIT_LOCKED_BIN_OP(btc, 16)
1365EMIT_LOCKED_BIN_OP(btr, 16)
1366EMIT_LOCKED_BIN_OP(bts, 16)
1367# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1368
1369
1370/*
1371 * Helpers for BSR and BSF.
1372 *
1373 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1374 * Intel behavior modelled on 10980XE, AMD on 3990X. Other microarchitectures
1375 * may produce different results (see https://www.sandpile.org/x86/flags.htm),
1376 * but we restrict ourselves to emulating these recent microarchitectures.
1377 */
1378#define SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlag, a_iBit) do { \
1379 unsigned iBit = (a_iBit); \
1380 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1381 if (iBit) \
1382 { \
1383 *puDst = --iBit; \
1384 fEfl |= g_afParity[iBit]; \
1385 } \
1386 else \
1387 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1388 *pfEFlags = fEfl; \
1389 } while (0)
1390#define SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlag, a_iBit) do { \
1391 unsigned const iBit = (a_iBit); \
1392 if (iBit) \
1393 { \
1394 *puDst = iBit - 1; \
1395 *pfEFlags &= ~X86_EFL_ZF; \
1396 } \
1397 else \
1398 *pfEFlags |= X86_EFL_ZF; \
1399 } while (0)
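/* Illustration: ASMBitFirstSetU64/ASMBitLastSetU64 and friends return a 1-based
   bit index (0 when the source is zero), hence the decrement before storing.
   E.g. a source of 0x28 makes BSF store 3 and BSR store 5 with ZF clear; a zero
   source leaves the destination untouched and sets ZF (plus PF on the Intel
   variant, which also clears the other "undefined" flags and otherwise derives
   PF from the result's parity). */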
1400
1401
1402/*
1403 * BSF - first (least significant) bit set
1404 */
1405IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1406{
1407 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1408}
1409
1410IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1411{
1412 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1413}
1414
1415IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1416{
1417 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1418}
1419
1420# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1421
1422IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1423{
1424 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1425}
1426
1427IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1428{
1429 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1430}
1431
1432IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1433{
1434 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1435}
1436
1437
1438IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1439{
1440 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1441}
1442
1443IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1444{
1445 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1446}
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1451}
1452
1453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1454
1455
1456/*
1457 * BSR - last (most significant) bit set
1458 */
1459IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1460{
1461 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1462}
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1472}
1473
1474# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1475
1476IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1477{
1478 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1479}
1480
1481IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1482{
1483 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1484}
1485
1486IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1487{
1488 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1489}
1490
1491
1492IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1493{
1494 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1495}
1496
1497IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1498{
1499 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1500}
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1505}
1506
1507# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1508
1509
1510/*
1511 * Helpers for LZCNT and TZCNT.
1512 */
1513#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1514 unsigned const uResult = (a_uResult); \
1515 *(a_puDst) = uResult; \
1516 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1517 if (uResult) \
1518 fEfl |= g_afParity[uResult]; \
1519 else \
1520 fEfl |= X86_EFL_ZF | X86_EFL_PF; \
1521 if (!a_uSrc) \
1522 fEfl |= X86_EFL_CF; \
1523 *(a_pfEFlags) = fEfl; \
1524 } while (0)
1525#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
1526 unsigned const uResult = (a_uResult); \
1527 *(a_puDst) = uResult; \
1528 uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
1529 if (!uResult) \
1530 fEfl |= X86_EFL_ZF; \
1531 if (!a_uSrc) \
1532 fEfl |= X86_EFL_CF; \
1533 *(a_pfEFlags) = fEfl; \
1534 } while (0)
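/* Illustration: both counters store the count unconditionally. On the Intel
   flavour ZF reflects a zero *result* and CF a zero *source*; e.g. a 32-bit lzcnt
   of 0x00000001 stores 31 with ZF and CF clear, while lzcnt of 0 stores 32 with
   CF set (ZF stays clear since the result is non-zero). The AMD flavour only
   updates ZF and CF and leaves the remaining status flags untouched. */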
1535
1536
1537/*
1538 * LZCNT - count leading zero bits.
1539 */
1540IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1541{
1542 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1543}
1544
1545IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1546{
1547 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1548}
1549
1550IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1551{
1552 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1553}
1554
1555# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1556
1557IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1558{
1559 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1560}
1561
1562IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1563{
1564 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1565}
1566
1567IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1568{
1569 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1570}
1571
1572
1573IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1574{
1575 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1576}
1577
1578IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1579{
1580 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1581}
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1584{
1585 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1586}
1587
1588# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1589
1590
1591/*
1592 * TZCNT - count trailing zero bits.
1593 */
1594IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1595{
1596 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1597}
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1600{
1601 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1607}
1608
1609# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1610
1611IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1612{
1613 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1614}
1615
1616IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1617{
1618 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1619}
1620
1621IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1622{
1623 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1624}
1625
1626
1627IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1628{
1629 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1630}
1631
1632IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1633{
1634 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1635}
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1638{
1639 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1640}
1641
1642# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1643#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1644
1645/*
1646 * BEXTR (BMI1 instruction)
1647 */
1648#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1649IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1650 a_Type uSrc2, uint32_t *pfEFlags)) \
1651{ \
1652 /* uSrc1 is considered to be virtually zero-extended to 512 bits in width. */ \
1653 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1654 a_Type uResult; \
1655 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1656 if (iFirstBit < a_cBits) \
1657 { \
1658 uResult = uSrc1 >> iFirstBit; \
1659 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1660 if (cBits < a_cBits) \
1661 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1662 *puDst = uResult; \
1663 if (!uResult) \
1664 fEfl |= X86_EFL_ZF; \
1665 } \
1666 else \
1667 { \
1668 *puDst = uResult = 0; \
1669 fEfl |= X86_EFL_ZF; \
1670 } \
1671 /** @todo complete flag calculations. */ \
1672 *pfEFlags = fEfl; \
1673}
1674
1675EMIT_BEXTR(64, uint64_t, _fallback)
1676EMIT_BEXTR(32, uint32_t, _fallback)
1677#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1678EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1679#endif
1680#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1681EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1682#endif
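/*
 * Illustrative sketch (not part of the build): for BEXTR the low byte of uSrc2 is
 * the start bit and the next byte the field length, e.g. start=4, length=8:
 *
 *     uint32_t uDst, fEfl = 0;
 *     iemAImpl_bextr_u32_fallback(&uDst, 0x12345678, 0x0804, &fEfl); // uDst = 0x67
 *     iemAImpl_bextr_u32_fallback(&uDst, 0x12345678, 0x0040, &fEfl); // start >= 32: uDst = 0, ZF set
 */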
1683
1684/*
1685 * BLSR (BMI1 instruction)
1686 */
1687#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1688IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1689{ \
1690 uint32_t fEfl1 = *pfEFlags; \
1691 uint32_t fEfl2 = fEfl1; \
1692 *puDst = uSrc; \
1693 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1694 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1695 \
1696 /* AMD: The carry flag is from the SUB operation. */ \
1697    /* 10980XE: PF always cleared? */ \
1698 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1699 fEfl2 |= fEfl1 & X86_EFL_CF; \
1700 *pfEFlags = fEfl2; \
1701}
1702
1703EMIT_BLSR(64, uint64_t, _fallback)
1704EMIT_BLSR(32, uint32_t, _fallback)
1705#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1706EMIT_BLSR(64, uint64_t, RT_NOTHING)
1707#endif
1708#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1709EMIT_BLSR(32, uint32_t, RT_NOTHING)
1710#endif
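/*
 * Illustrative sketch (not part of the build): BLSR clears the lowest set bit,
 * i.e. dst = src & (src - 1), with CF taken from the SUB (set only for src == 0):
 *
 *     uint32_t uDst, fEfl = 0;
 *     iemAImpl_blsr_u32_fallback(&uDst, 0x58, &fEfl); // uDst = 0x50, CF clear
 *     iemAImpl_blsr_u32_fallback(&uDst, 0,    &fEfl); // uDst = 0,    CF and ZF set
 */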
1711
1712/*
1713 * BLSMSK (BMI1 instruction)
1714 */
1715#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1716IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1717{ \
1718 uint32_t fEfl1 = *pfEFlags; \
1719 uint32_t fEfl2 = fEfl1; \
1720 *puDst = uSrc; \
1721 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1722 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1723 \
1724 /* AMD: The carry flag is from the SUB operation. */ \
1725    /* 10980XE: PF always cleared? */ \
1726 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1727 fEfl2 |= fEfl1 & X86_EFL_CF; \
1728 *pfEFlags = fEfl2; \
1729}
1730
1731EMIT_BLSMSK(64, uint64_t, _fallback)
1732EMIT_BLSMSK(32, uint32_t, _fallback)
1733#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1734EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1735#endif
1736#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1737EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1738#endif
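/*
 * Illustrative sketch (not part of the build): BLSMSK produces a mask up to and
 * including the lowest set bit, i.e. dst = src ^ (src - 1):
 *
 *     uint32_t uDst, fEfl = 0;
 *     iemAImpl_blsmsk_u32_fallback(&uDst, 0x58, &fEfl); // uDst = 0x0f,       CF clear
 *     iemAImpl_blsmsk_u32_fallback(&uDst, 0,    &fEfl); // uDst = 0xffffffff, CF set
 */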
1739
1740/*
1741 * BLSI (BMI1 instruction)
1742 */
1743#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1744IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1745{ \
1746 uint32_t fEfl1 = *pfEFlags; \
1747 uint32_t fEfl2 = fEfl1; \
1748 *puDst = uSrc; \
1749 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1750 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1751 \
1752    /* AMD: The carry flag is from the NEG (0 - uSrc) operation. */ \
1753    /* 10980XE: PF always cleared? */ \
1754 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1755 fEfl2 |= fEfl1 & X86_EFL_CF; \
1756 *pfEFlags = fEfl2; \
1757}
1758
1759EMIT_BLSI(64, uint64_t, _fallback)
1760EMIT_BLSI(32, uint32_t, _fallback)
1761#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1762EMIT_BLSI(64, uint64_t, RT_NOTHING)
1763#endif
1764#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1765EMIT_BLSI(32, uint32_t, RT_NOTHING)
1766#endif
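/*
 * Illustrative sketch (not part of the build): BLSI isolates the lowest set bit,
 * i.e. dst = src & (0 - src), with CF set whenever the source is non-zero:
 *
 *     uint32_t uDst, fEfl = 0;
 *     iemAImpl_blsi_u32_fallback(&uDst, 0x58, &fEfl); // uDst = 0x08, CF set
 *     iemAImpl_blsi_u32_fallback(&uDst, 0,    &fEfl); // uDst = 0,    ZF set, CF clear
 */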
1767
1768/*
1769 * BZHI (BMI2 instruction)
1770 */
1771#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1772IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1773 a_Type uSrc2, uint32_t *pfEFlags)) \
1774{ \
1775 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1776 a_Type uResult; \
1777 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1778 if (iFirstBit < a_cBits) \
1779 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1780 else \
1781 { \
1782 uResult = uSrc1; \
1783 fEfl |= X86_EFL_CF; \
1784 } \
1785 *puDst = uResult; \
1786 fEfl |= X86_EFL_CALC_ZF(uResult); \
1787 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1788 *pfEFlags = fEfl; \
1789}
1790
1791EMIT_BZHI(64, uint64_t, _fallback)
1792EMIT_BZHI(32, uint32_t, _fallback)
1793#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1794EMIT_BZHI(64, uint64_t, RT_NOTHING)
1795#endif
1796#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1797EMIT_BZHI(32, uint32_t, RT_NOTHING)
1798#endif
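/*
 * Illustrative sketch (not part of the build): BZHI zeroes all bits from the index
 * given in the low byte of uSrc2 upwards; an index >= the operand width copies the
 * source unmodified and sets CF instead:
 *
 *     uint32_t uDst, fEfl = 0;
 *     iemAImpl_bzhi_u32_fallback(&uDst, 0xffffffff, 8,  &fEfl); // uDst = 0xff,       CF clear
 *     iemAImpl_bzhi_u32_fallback(&uDst, 0xffffffff, 40, &fEfl); // uDst = 0xffffffff, CF set
 */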
1799
1800/*
1801 * POPCNT
1802 */
1803RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1804{
1805 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1806 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1807 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1808 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1809};
1810
1811/** @todo Use native popcount where possible and employ some more efficient
1812 * algorithm here (or in asm.h fallback)! */
1813
1814DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1815{
1816 return g_abBitCounts6[ u16 & 0x3f]
1817 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1818 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1819}
1820
1821DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1822{
1823 return g_abBitCounts6[ u32 & 0x3f]
1824 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1825 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1826 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1827 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1828 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1829}
1830
1831DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1832{
1833 return g_abBitCounts6[ u64 & 0x3f]
1834 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1835 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1836 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1837 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1838 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1839 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1840 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1841 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1842 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1843 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1844}
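/*
 * Illustrative sketch (not part of the build): the helpers above split the value
 * into 6-bit chunks and sum the table entries, e.g. for 0x00ff:
 *
 *     iemPopCountU16(0x00ff) == g_abBitCounts6[0x3f]   // low 6 bits  -> 6
 *                             + g_abBitCounts6[0x03]   // bits 6..11  -> 2
 *                             + g_abBitCounts6[0x00];  // bits 12..15 -> 0, total 8
 */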
1845
1846#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1848{ \
1849 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1850 a_Type uResult; \
1851 if (uSrc) \
1852 uResult = iemPopCountU ## a_cBits(uSrc); \
1853 else \
1854 { \
1855 fEfl |= X86_EFL_ZF; \
1856 uResult = 0; \
1857 } \
1858 *puDst = uResult; \
1859 *pfEFlags = fEfl; \
1860}
1861
1862EMIT_POPCNT(64, uint64_t, _fallback)
1863EMIT_POPCNT(32, uint32_t, _fallback)
1864EMIT_POPCNT(16, uint16_t, _fallback)
1865#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1866EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1867#endif
1868#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1869EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1870EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1871#endif
1872
1873
1874#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1875
1876/*
1877 * XCHG
1878 */
1879
1880IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1881{
1882#if ARCH_BITS >= 64
1883 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1884#else
1885 uint64_t uOldMem = *puMem;
1886 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1887 ASMNopPause();
1888 *puReg = uOldMem;
1889#endif
1890}
1891
1892# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1893
1894IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1895{
1896 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1897}
1898
1899
1900IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1901{
1902 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1903}
1904
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1907{
1908 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1909}
1910
1911# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1912
1913
1914/* Unlocked variants for fDisregardLock mode: */
1915
1916IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1917{
1918 uint64_t const uOld = *puMem;
1919 *puMem = *puReg;
1920 *puReg = uOld;
1921}
1922
1923# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1924
1925IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1926{
1927 uint32_t const uOld = *puMem;
1928 *puMem = *puReg;
1929 *puReg = uOld;
1930}
1931
1932
1933IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1934{
1935 uint16_t const uOld = *puMem;
1936 *puMem = *puReg;
1937 *puReg = uOld;
1938}
1939
1940
1941IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1942{
1943 uint8_t const uOld = *puMem;
1944 *puMem = *puReg;
1945 *puReg = uOld;
1946}
1947
1948# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1949
1950
1951/*
1952 * XADD and LOCK XADD.
1953 */
1954#define EMIT_XADD(a_cBitsWidth, a_Type) \
1955IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1956{ \
1957 a_Type uDst = *puDst; \
1958 a_Type uResult = uDst; \
1959 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1960 *puDst = uResult; \
1961 *puReg = uDst; \
1962} \
1963\
1964IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1965{ \
1966 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1967 a_Type uResult; \
1968 uint32_t fEflTmp; \
1969 do \
1970 { \
1971 uResult = uOld; \
1972 fEflTmp = *pfEFlags; \
1973 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
1974 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
1975 *puReg = uOld; \
1976 *pfEFlags = fEflTmp; \
1977}
1978EMIT_XADD(64, uint64_t)
1979# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1980EMIT_XADD(32, uint32_t)
1981EMIT_XADD(16, uint16_t)
1982EMIT_XADD(8, uint8_t)
1983# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
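/*
 * Illustrative sketch (not part of the build): XADD exchanges and adds, so the
 * register operand ends up with the old destination and the destination with the sum:
 *
 *     uint32_t uMem = 5, uReg = 3, fEfl = 0;
 *     iemAImpl_xadd_u32(&uMem, &uReg, &fEfl); // uMem = 8, uReg = 5
 *
 * The _locked variant retries the addition in an ASMAtomicCmpXchgEx loop until the
 * memory operand is updated without interference.
 */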
1984
1985#endif
1986
1987/*
1988 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
1989 *
1990 * Note! We don't have non-locking/non-atomic cmpxchg primitives, so all cmpxchg
1991 * instructions are emulated as locked.
1992 */
1993#if defined(IEM_WITHOUT_ASSEMBLY)
1994
1995IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
1996{
1997 uint8_t uOld = *puAl;
1998 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
1999 Assert(*puAl == uOld);
2000 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2001}
2002
2003
2004IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2005{
2006 uint16_t uOld = *puAx;
2007 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2008 Assert(*puAx == uOld);
2009 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2010}
2011
2012
2013IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2014{
2015 uint32_t uOld = *puEax;
2016 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2017 Assert(*puEax == uOld);
2018 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2019}
2020
2021
2022# if ARCH_BITS == 32
2023IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2024# else
2025IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2026# endif
2027{
2028# if ARCH_BITS == 32
2029 uint64_t const uSrcReg = *puSrcReg;
2030# endif
2031 uint64_t uOld = *puRax;
2032 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2033 Assert(*puRax == uOld);
2034 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2035}
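/*
 * Illustrative sketch (not part of the build): CMPXCHG compares the accumulator with
 * the destination; on a match the destination is replaced and ZF is set, otherwise
 * the accumulator receives the destination value:
 *
 *     uint32_t uMem = 7, uEax = 7, fEfl = 0;
 *     iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 9, &fEfl); // uMem = 9, uEax = 7, ZF set
 *     iemAImpl_cmpxchg_u32_locked(&uMem, &uEax, 5, &fEfl); // uMem = 9, uEax = 9, ZF clear
 */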
2036
2037
2038IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2039 uint32_t *pEFlags))
2040{
2041 uint64_t const uNew = pu64EbxEcx->u;
2042 uint64_t const uOld = pu64EaxEdx->u;
2043 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2044 {
2045 Assert(pu64EaxEdx->u == uOld);
2046 *pEFlags |= X86_EFL_ZF;
2047 }
2048 else
2049 *pEFlags &= ~X86_EFL_ZF;
2050}
2051
2052
2053# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
2054IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2055 uint32_t *pEFlags))
2056{
2057# ifdef VBOX_STRICT
2058 RTUINT128U const uOld = *pu128RaxRdx;
2059# endif
2060# if defined(RT_ARCH_AMD64)
2061 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2062 &pu128RaxRdx->u))
2063# else
2064 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2065# endif
2066 {
2067 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2068 *pEFlags |= X86_EFL_ZF;
2069 }
2070 else
2071 *pEFlags &= ~X86_EFL_ZF;
2072}
2073# endif
2074
2075#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2076
2077# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2078IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2079 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2080{
2081 RTUINT128U u128Tmp = *pu128Dst;
2082 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2083 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2084 {
2085 *pu128Dst = *pu128RbxRcx;
2086 *pEFlags |= X86_EFL_ZF;
2087 }
2088 else
2089 {
2090 *pu128RaxRdx = u128Tmp;
2091 *pEFlags &= ~X86_EFL_ZF;
2092 }
2093}
2094#endif /* !RT_ARCH_ARM64 */
2095
2096#if defined(IEM_WITHOUT_ASSEMBLY)
2097
2098/* Unlocked versions mapped to the locked ones: */
2099
2100IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2101{
2102 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2103}
2104
2105
2106IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2107{
2108 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2109}
2110
2111
2112IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2113{
2114 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2115}
2116
2117
2118# if ARCH_BITS == 32
2119IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2120{
2121 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2122}
2123# else
2124IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2125{
2126 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2127}
2128# endif
2129
2130
2131IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2132{
2133 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2134}
2135
2136
2137IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2138 uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2141}
2142
2143#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2144
2145#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2146 && !defined(DOXYGEN_RUNNING) /* Doxygen has some grokking issues here and ends up mixing up input. Not worth tracking down now. */
2147
2148/*
2149 * MUL, IMUL, DIV and IDIV helpers.
2150 *
2151 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2152 * division step so we can select between using C operators and
2153 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2154 *
2155 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with the
2156 * IDIV/DIV taking all the input in AX too. This means we have to abstract some
2157 * input loads and the result storing.
2158 */
2159
2160DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2161{
2162# ifdef __GNUC__ /* GCC may be really annoying in this function. */
2163 pQuotient->s.Lo = 0;
2164 pQuotient->s.Hi = 0;
2165# endif
2166 RTUINT128U Divisor;
2167 Divisor.s.Lo = u64Divisor;
2168 Divisor.s.Hi = 0;
2169 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2170}
2171
2172# define DIV_LOAD(a_Dividend) \
2173 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
2174# define DIV_LOAD_U8(a_Dividend) \
2175 a_Dividend.u = *puAX
2176
2177# define DIV_STORE(a_Quotient, a_uRemainder)    *puA  = (a_Quotient), *puD = (a_uRemainder)
2178# define DIV_STORE_U8(a_Quotient, a_uRemainder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8)
2179
2180# define MUL_LOAD_F1() *puA
2181# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2182
2183# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2184# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2185
2186# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2187 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2188# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2189 RTUInt128AssignNeg(&(a_Value))
2190
2191# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2192 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2193# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2194 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2195
2196# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2197 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2198 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2199# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2200 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2201
2202
2203/*
2204 * MUL
2205 */
2206# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2207IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2208{ \
2209 RTUINT ## a_cBitsWidth2x ## U Result; \
2210 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2211 a_fnStore(Result); \
2212 \
2213 /* Calc EFLAGS: */ \
2214 uint32_t fEfl = *pfEFlags; \
2215 if (a_fIntelFlags) \
2216 { /* Intel: 6700K and 10980XE behavior */ \
2217 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2218 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2219 fEfl |= X86_EFL_SF; \
2220 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2221 if (Result.s.Hi != 0) \
2222 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2223 } \
2224 else \
2225 { /* AMD: 3990X */ \
2226 if (Result.s.Hi != 0) \
2227 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2228 else \
2229 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2230 } \
2231 *pfEFlags = fEfl; \
2232 return 0; \
2233} \
2234
2235# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2236 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2237 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2238 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2239
2240# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2241EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2242 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2243# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2244EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2245 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2246EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2247 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2248EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2249 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2250# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2251# endif /* !DOXYGEN_RUNNING */
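/*
 * Illustrative sketch (not part of the build): MUL stores the double-width product
 * in the xDX:xAX pair abstracted by MUL_STORE, and CF/OF are set when the upper
 * half is non-zero:
 *
 *     uint64_t uRax = UINT64_C(0x100000000), uRdx = 0; uint32_t fEfl = 0;
 *     iemAImpl_mul_u64(&uRax, &uRdx, UINT64_C(0x100000000), &fEfl);
 *     // 2^32 * 2^32 = 2^64: uRax = 0, uRdx = 1, CF and OF set
 */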
2252
2253/*
2254 * MULX
2255 */
2256# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2257IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2258 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2259{ \
2260 RTUINT ## a_cBitsWidth2x ## U Result; \
2261 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2262 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2263 *puDst1 = Result.s.Hi; \
2264} \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2268EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2271EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2272# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2273# endif /* !DOXYGEN_RUNNING */
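/*
 * Illustrative sketch (not part of the build): MULX does not touch EFLAGS and writes
 * the low half first, so the high half wins when both destinations alias the same
 * register:
 *
 *     uint32_t uHi, uLo;
 *     iemAImpl_mulx_u32_fallback(&uHi, &uLo, 0x10000, 0x10000); // uLo = 0, uHi = 1
 *     iemAImpl_mulx_u32_fallback(&uHi, &uHi, 0x10000, 0x10000); // aliased: uHi = 1 (high part)
 */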
2274
2275
2276/*
2277 * IMUL
2278 *
2279 * The SF, ZF, AF and PF flags are "undefined". AMD (3990X) leaves these
2280 * flags as-is, whereas Intel (Skylake 6700K and Cascade Lake 10980XE) always
2281 * clears AF and ZF and calculates SF and PF from the lower half of the result.
2282 */
2283# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2284 a_Suffix, a_fIntelFlags) \
2285IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2286{ \
2287 RTUINT ## a_cBitsWidth2x ## U Result; \
2288 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2289 \
2290 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2291 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2292 { \
2293 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2294 { \
2295 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2296 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2297 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2298 } \
2299 else \
2300 { \
2301 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2302 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2303 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2304 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2305 a_fnNeg(Result, a_cBitsWidth2x); \
2306 } \
2307 } \
2308 else \
2309 { \
2310 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2311 { \
2312 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2313 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2314 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2315 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2316 a_fnNeg(Result, a_cBitsWidth2x); \
2317 } \
2318 else \
2319 { \
2320 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2321 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2322 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2323 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2324 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2325 } \
2326 } \
2327 a_fnStore(Result); \
2328 \
2329 if (a_fIntelFlags) \
2330 { \
2331 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2332 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2333 fEfl |= X86_EFL_SF; \
2334 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2335 } \
2336 *pfEFlags = fEfl; \
2337 return 0; \
2338}
2339# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2340 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2341 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2342 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2343
2344# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2345EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2346 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2347# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2348EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2349 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2350EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2351 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2352EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2353 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2354# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2355# endif /* !DOXYGEN_RUNNING */
2356
2357
2358/*
2359 * IMUL with two operands is mapped onto the three-operand variant, ignoring
2360 * the high part of the product.
2361 */
2362# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2363IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2364{ \
2365 a_uType uIgn; \
2366 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2367} \
2368\
2369IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2370{ \
2371 a_uType uIgn; \
2372 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2373} \
2374\
2375IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2376{ \
2377 a_uType uIgn; \
2378 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2379}
2380
2381EMIT_IMUL_TWO(64, uint64_t)
2382# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2383EMIT_IMUL_TWO(32, uint32_t)
2384EMIT_IMUL_TWO(16, uint16_t)
2385# endif
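/*
 * Illustrative sketch (not part of the build): the two-operand form discards the
 * high half of the product but still reports CF/OF when the full signed product
 * does not fit in the destination:
 *
 *     uint16_t uDst = 300; uint32_t fEfl = 0;
 *     iemAImpl_imul_two_u16(&uDst, 300, &fEfl); // 90000 truncates to 0x5f90, CF and OF set
 */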
2386
2387
2388/*
2389 * DIV
2390 */
2391# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2392 a_Suffix, a_fIntelFlags) \
2393IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2394{ \
2395 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2396 a_fnLoad(Dividend); \
2397 if ( uDivisor != 0 \
2398 && Dividend.s.Hi < uDivisor) \
2399 { \
2400 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2401 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2402 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2403 \
2404        /* Calc EFLAGS: Intel 6700K and 10980XE leave them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2405 if (!a_fIntelFlags) \
2406 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2407 return 0; \
2408 } \
2409 /* #DE */ \
2410 return -1; \
2411}
2412# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2413 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2414 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2415 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2416
2417# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2418EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2419 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2420# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2421EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2422 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2423EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2424 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2425EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2426 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2427# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2428# endif /* !DOXYGEN_RUNNING */
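/*
 * Illustrative sketch (not part of the build): the double-width dividend is loaded
 * from the xDX:xAX pair; a zero divisor or a quotient that does not fit makes the
 * helper return -1 so the caller can raise #DE:
 *
 *     uint16_t uAx = 0, uDx = 1; uint32_t fEfl = 0;
 *     int rc = iemAImpl_div_u16(&uAx, &uDx, 2, &fEfl); // 0x10000 / 2: uAx = 0x8000, uDx = 0, rc = 0
 *     rc = iemAImpl_div_u16(&uAx, &uDx, 0, &fEfl);     // division by zero: rc = -1 (#DE)
 */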
2429
2430
2431/*
2432 * IDIV
2433 *
2434 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2435 * set AF and clear PF, ZF and SF just like it does for DIV.
2436 *
2437 */
2438# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2439 a_Suffix, a_fIntelFlags) \
2440IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2441{ \
2442 /* Note! Skylake leaves all flags alone. */ \
2443 \
2444 /** @todo overflow checks */ \
2445 if (uDivisor != 0) \
2446 { \
2447 /* \
2448 * Convert to unsigned division. \
2449 */ \
2450 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2451 a_fnLoad(Dividend); \
2452 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2453 if (fSignedDividend) \
2454 a_fnNeg(Dividend, a_cBitsWidth2x); \
2455 \
2456 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2457 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2458 uDivisorPositive = uDivisor; \
2459 else \
2460 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2461 \
2462 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2463 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2464 \
2465 /* \
2466 * Setup the result, checking for overflows. \
2467 */ \
2468 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2469 { \
2470 if (!fSignedDividend) \
2471 { \
2472 /* Positive divisor, positive dividend => result positive. */ \
2473 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2474 { \
2475 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2476 if (!a_fIntelFlags) \
2477 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2478 return 0; \
2479 } \
2480 } \
2481 else \
2482 { \
2483 /* Positive divisor, negative dividend => result negative. */ \
2484 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2485 { \
2486 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2487 if (!a_fIntelFlags) \
2488 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2489 return 0; \
2490 } \
2491 } \
2492 } \
2493 else \
2494 { \
2495 if (!fSignedDividend) \
2496 { \
2497 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2498 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2499 { \
2500 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2501 if (!a_fIntelFlags) \
2502 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2503 return 0; \
2504 } \
2505 } \
2506 else \
2507 { \
2508 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2509 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2510 { \
2511 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2512 if (!a_fIntelFlags) \
2513 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2514 return 0; \
2515 } \
2516 } \
2517 } \
2518 } \
2519 /* #DE */ \
2520 return -1; \
2521}
2522# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2523 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2524 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2525 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2526
2527# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2528EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2529 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2530# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2531EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2532 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2533EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2534 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2535EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2536 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2537# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2538# endif /* !DOXYGEN_RUNNING */
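/*
 * Illustrative sketch (not part of the build): IDIV truncates towards zero, so the
 * remainder takes the sign of the dividend; with DX:AX = -100 divided by 3:
 *
 *     uint16_t uAx = 0xff9c, uDx = 0xffff; uint32_t fEfl = 0;
 *     int rc = iemAImpl_idiv_u16(&uAx, &uDx, 3, &fEfl); // uAx = 0xffdf (-33), uDx = 0xffff (-1), rc = 0
 */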
2539
2540#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2541
2542
2543/*********************************************************************************************************************************
2544* Unary operations. *
2545*********************************************************************************************************************************/
2546#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2547
2548/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2549 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2550 *
2551 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2552 * borrowing in arithmetic loops on intel 8008).
2553 *
2554 * @returns Status bits.
2555 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2556 * @param a_uResult Unsigned result value.
2557 * @param a_uDst The original destination value (for AF calc).
2558 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2559 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2560 */
2561#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2562 do { \
2563 uint32_t fEflTmp = *(a_pfEFlags); \
2564 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2565 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2566 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2567 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2568 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2569 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2570 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2571 *(a_pfEFlags) = fEflTmp; \
2572 } while (0)
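/*
 * Illustrative sketch (not part of the build): incrementing 0x7f in 8-bit width
 * gives 0x80, which sets OF (signed overflow), SF and AF while leaving CF alone:
 *
 *     uint8_t uDst = 0x7f; uint32_t fEfl = X86_EFL_CF;
 *     iemAImpl_inc_u8(&uDst, &fEfl); // uDst = 0x80; OF, SF and AF set; CF still set
 */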
2573
2574/*
2575 * INC
2576 */
2577
2578IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2579{
2580 uint64_t uDst = *puDst;
2581 uint64_t uResult = uDst + 1;
2582 *puDst = uResult;
2583 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2584}
2585
2586# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2587
2588IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2589{
2590 uint32_t uDst = *puDst;
2591 uint32_t uResult = uDst + 1;
2592 *puDst = uResult;
2593 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2594}
2595
2596
2597IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2598{
2599 uint16_t uDst = *puDst;
2600 uint16_t uResult = uDst + 1;
2601 *puDst = uResult;
2602 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2603}
2604
2605IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2606{
2607 uint8_t uDst = *puDst;
2608 uint8_t uResult = uDst + 1;
2609 *puDst = uResult;
2610 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2611}
2612
2613# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2614
2615
2616/*
2617 * DEC
2618 */
2619
2620IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2621{
2622 uint64_t uDst = *puDst;
2623 uint64_t uResult = uDst - 1;
2624 *puDst = uResult;
2625    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*DEC*/);
2626}
2627
2628# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2629
2630IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2631{
2632 uint32_t uDst = *puDst;
2633 uint32_t uResult = uDst - 1;
2634 *puDst = uResult;
2635    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*DEC*/);
2636}
2637
2638
2639IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2640{
2641 uint16_t uDst = *puDst;
2642 uint16_t uResult = uDst - 1;
2643 *puDst = uResult;
2644    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*DEC*/);
2645}
2646
2647
2648IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2649{
2650 uint8_t uDst = *puDst;
2651 uint8_t uResult = uDst - 1;
2652 *puDst = uResult;
2653    IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*DEC*/);
2654}
2655
2656# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2657
2658
2659/*
2660 * NOT
2661 */
2662
2663IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2664{
2665 uint64_t uDst = *puDst;
2666 uint64_t uResult = ~uDst;
2667 *puDst = uResult;
2668 /* EFLAGS are not modified. */
2669 RT_NOREF_PV(pfEFlags);
2670}
2671
2672# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint32_t uDst = *puDst;
2677 uint32_t uResult = ~uDst;
2678 *puDst = uResult;
2679 /* EFLAGS are not modified. */
2680 RT_NOREF_PV(pfEFlags);
2681}
2682
2683IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2684{
2685 uint16_t uDst = *puDst;
2686 uint16_t uResult = ~uDst;
2687 *puDst = uResult;
2688 /* EFLAGS are not modified. */
2689 RT_NOREF_PV(pfEFlags);
2690}
2691
2692IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2693{
2694 uint8_t uDst = *puDst;
2695 uint8_t uResult = ~uDst;
2696 *puDst = uResult;
2697 /* EFLAGS are not modified. */
2698 RT_NOREF_PV(pfEFlags);
2699}
2700
2701# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2702
2703
2704/*
2705 * NEG
2706 */
2707
2708/**
2709 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
2710 *
2711 * @returns Status bits.
2712 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2713 * @param a_uResult Unsigned result value.
2714 * @param a_uDst The original destination value (for AF calc).
2715 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2716 */
2717#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
2718 do { \
2719 uint32_t fEflTmp = *(a_pfEFlags); \
2720 fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
2721 fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
2722 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
2723 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2724 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2725 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
2726 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
2727 *(a_pfEFlags) = fEflTmp; \
2728 } while (0)
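/*
 * Illustrative sketch (not part of the build): NEG sets CF for any non-zero input
 * and OF only when negating the most negative value:
 *
 *     uint8_t uDst = 0x01; uint32_t fEfl = 0;
 *     iemAImpl_neg_u8(&uDst, &fEfl); // uDst = 0xff, CF and SF set, OF clear
 *     uDst = 0x80;
 *     iemAImpl_neg_u8(&uDst, &fEfl); // uDst = 0x80, CF, SF and OF set
 */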
2729
2730IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2731{
2732 uint64_t uDst = *puDst;
2733 uint64_t uResult = (uint64_t)0 - uDst;
2734 *puDst = uResult;
2735 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2736}
2737
2738# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2739
2740IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2741{
2742 uint32_t uDst = *puDst;
2743 uint32_t uResult = (uint32_t)0 - uDst;
2744 *puDst = uResult;
2745 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2746}
2747
2748
2749IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2750{
2751 uint16_t uDst = *puDst;
2752 uint16_t uResult = (uint16_t)0 - uDst;
2753 *puDst = uResult;
2754 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2755}
2756
2757
2758IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2759{
2760 uint8_t uDst = *puDst;
2761 uint8_t uResult = (uint8_t)0 - uDst;
2762 *puDst = uResult;
2763 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2764}
2765
2766# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2767
2768/*
2769 * Locked variants.
2770 */
2771
2772/** Emit a function for doing a locked unary operand operation. */
2773# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2774 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2775 uint32_t *pfEFlags)) \
2776 { \
2777 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2778 uint ## a_cBitsWidth ## _t uTmp; \
2779 uint32_t fEflTmp; \
2780 do \
2781 { \
2782 uTmp = uOld; \
2783 fEflTmp = *pfEFlags; \
2784 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2785 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2786 *pfEFlags = fEflTmp; \
2787 }
2788
2789EMIT_LOCKED_UNARY_OP(inc, 64)
2790EMIT_LOCKED_UNARY_OP(dec, 64)
2791EMIT_LOCKED_UNARY_OP(not, 64)
2792EMIT_LOCKED_UNARY_OP(neg, 64)
2793# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2794EMIT_LOCKED_UNARY_OP(inc, 32)
2795EMIT_LOCKED_UNARY_OP(dec, 32)
2796EMIT_LOCKED_UNARY_OP(not, 32)
2797EMIT_LOCKED_UNARY_OP(neg, 32)
2798
2799EMIT_LOCKED_UNARY_OP(inc, 16)
2800EMIT_LOCKED_UNARY_OP(dec, 16)
2801EMIT_LOCKED_UNARY_OP(not, 16)
2802EMIT_LOCKED_UNARY_OP(neg, 16)
2803
2804EMIT_LOCKED_UNARY_OP(inc, 8)
2805EMIT_LOCKED_UNARY_OP(dec, 8)
2806EMIT_LOCKED_UNARY_OP(not, 8)
2807EMIT_LOCKED_UNARY_OP(neg, 8)
2808# endif
2809
2810#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2811
2812
2813/*********************************************************************************************************************************
2814* Shifting and Rotating *
2815*********************************************************************************************************************************/
2816
2817/*
2818 * ROL
2819 */
2820#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2821IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2822{ \
2823 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2824 if (cShift) \
2825 { \
2826 if (a_cBitsWidth < 32) \
2827 cShift &= a_cBitsWidth - 1; \
2828 a_uType const uDst = *puDst; \
2829 a_uType const uResult = a_fnHlp(uDst, cShift); \
2830 *puDst = uResult; \
2831 \
2832        /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
2833           it the same way as for 1-bit shifts. */ \
2834 AssertCompile(X86_EFL_CF_BIT == 0); \
2835 uint32_t fEfl = *pfEFlags; \
2836 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2837 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2838 fEfl |= fCarry; \
2839 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2840 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2841 else /* Intel 10980XE: According to the first sub-shift: */ \
2842 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2843 *pfEFlags = fEfl; \
2844 } \
2845}
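/*
 * Illustrative sketch (not part of the build): for a single-bit rotate both flag
 * variants agree; CF receives the bit rotated into position 0 and OF becomes CF
 * XOR the new most significant bit:
 *
 *     uint8_t uDst = 0x81; uint32_t fEfl = 0;
 *     iemAImpl_rol_u8_amd(&uDst, 1, &fEfl); // uDst = 0x03, CF set, OF set
 */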
2846
2847#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2848EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2849#endif
2850EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2851EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2852
2853#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2854EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2855#endif
2856EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2857EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2858
2859DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2860{
2861 return (uValue << cShift) | (uValue >> (16 - cShift));
2862}
2863#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2864EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2865#endif
2866EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2867EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2868
2869DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2870{
2871 return (uValue << cShift) | (uValue >> (8 - cShift));
2872}
2873#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2875#endif
2876EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2877EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2878
2879
2880/*
2881 * ROR
2882 */
2883#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2884IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2885{ \
2886 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2887 if (cShift) \
2888 { \
2889 if (a_cBitsWidth < 32) \
2890 cShift &= a_cBitsWidth - 1; \
2891 a_uType const uDst = *puDst; \
2892 a_uType const uResult = a_fnHlp(uDst, cShift); \
2893 *puDst = uResult; \
2894 \
2895 /* Calc EFLAGS: */ \
2896 AssertCompile(X86_EFL_CF_BIT == 0); \
2897 uint32_t fEfl = *pfEFlags; \
2898 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2899 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2900 fEfl |= fCarry; \
2901 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2902 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2903 else /* Intel 10980XE: According to the first sub-shift: */ \
2904 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2905 *pfEFlags = fEfl; \
2906 } \
2907}
2908
2909#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2910EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2911#endif
2912EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2913EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2914
2915#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2916EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2917#endif
2918EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2919EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2920
2921DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2922{
2923 return (uValue >> cShift) | (uValue << (16 - cShift));
2924}
2925#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2926EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2927#endif
2928EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2929EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2930
2931DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2932{
2933 return (uValue >> cShift) | (uValue << (8 - cShift));
2934}
2935#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2937#endif
2938EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2939EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2940
2941
2942/*
2943 * RCL
2944 */
2945#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2946IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2947{ \
2948 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2949 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2950 cShift %= a_cBitsWidth + 1; \
2951 if (cShift) \
2952 { \
2953 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2954 cShift %= a_cBitsWidth + 1; \
2955 a_uType const uDst = *puDst; \
2956 a_uType uResult = uDst << cShift; \
2957 if (cShift > 1) \
2958 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2959 \
2960 AssertCompile(X86_EFL_CF_BIT == 0); \
2961 uint32_t fEfl = *pfEFlags; \
2962 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2963 uResult |= (a_uType)fInCarry << (cShift - 1); \
2964 \
2965 *puDst = uResult; \
2966 \
2967 /* Calc EFLAGS. */ \
2968 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2969 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2970 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2971 fEfl |= fOutCarry; \
2972 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2973 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
2974 else /* Intel 10980XE: According to the first sub-shift: */ \
2975 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2976 *pfEFlags = fEfl; \
2977 } \
2978}
2979
2980#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2981EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
2982#endif
2983EMIT_RCL(64, uint64_t, _intel, 1)
2984EMIT_RCL(64, uint64_t, _amd, 0)
2985
2986#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2987EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
2988#endif
2989EMIT_RCL(32, uint32_t, _intel, 1)
2990EMIT_RCL(32, uint32_t, _amd, 0)
2991
2992#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2993EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
2994#endif
2995EMIT_RCL(16, uint16_t, _intel, 1)
2996EMIT_RCL(16, uint16_t, _amd, 0)
2997
2998#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2999EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3000#endif
3001EMIT_RCL(8, uint8_t, _intel, 1)
3002EMIT_RCL(8, uint8_t, _amd, 0)
3003
3004
3005/*
3006 * RCR
3007 */
3008#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3009IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3010{ \
3011 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3012 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3013 cShift %= a_cBitsWidth + 1; \
3014 if (cShift) \
3015 { \
3016 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3017 cShift %= a_cBitsWidth + 1; \
3018 a_uType const uDst = *puDst; \
3019 a_uType uResult = uDst >> cShift; \
3020 if (cShift > 1) \
3021 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3022 \
3023 AssertCompile(X86_EFL_CF_BIT == 0); \
3024 uint32_t fEfl = *pfEFlags; \
3025 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3026 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3027 *puDst = uResult; \
3028 \
3029        /* Calc EFLAGS. The OF bit is undefined if cShift > 1; we implement \
3030           it the same way as for 1-bit shifts. */ \
3031 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3032 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3033 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3034 fEfl |= fOutCarry; \
3035        if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
3036 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3037 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3038 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3039 *pfEFlags = fEfl; \
3040 } \
3041}
3042
3043#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3044EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3045#endif
3046EMIT_RCR(64, uint64_t, _intel, 1)
3047EMIT_RCR(64, uint64_t, _amd, 0)
3048
3049#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3050EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3051#endif
3052EMIT_RCR(32, uint32_t, _intel, 1)
3053EMIT_RCR(32, uint32_t, _amd, 0)
3054
3055#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3056EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3057#endif
3058EMIT_RCR(16, uint16_t, _intel, 1)
3059EMIT_RCR(16, uint16_t, _amd, 0)
3060
3061#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3062EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3063#endif
3064EMIT_RCR(8, uint8_t, _intel, 1)
3065EMIT_RCR(8, uint8_t, _amd, 0)
3066
3067
3068/*
3069 * SHL
3070 */
3071#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3072IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3073{ \
3074 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3075 if (cShift) \
3076 { \
3077 a_uType const uDst = *puDst; \
3078 a_uType uResult = uDst << cShift; \
3079 *puDst = uResult; \
3080 \
3081 /* Calc EFLAGS. */ \
3082 AssertCompile(X86_EFL_CF_BIT == 0); \
3083 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3084 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3085 fEfl |= fCarry; \
3086 if (!a_fIntelFlags) \
3087 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3088 else \
3089 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3090 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3091 fEfl |= X86_EFL_CALC_ZF(uResult); \
3092 fEfl |= g_afParity[uResult & 0xff]; \
3093 if (!a_fIntelFlags) \
3094 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3095 *pfEFlags = fEfl; \
3096 } \
3097}
3098
3099#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3100EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3101#endif
3102EMIT_SHL(64, uint64_t, _intel, 1)
3103EMIT_SHL(64, uint64_t, _amd, 0)
3104
3105#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3106EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3107#endif
3108EMIT_SHL(32, uint32_t, _intel, 1)
3109EMIT_SHL(32, uint32_t, _amd, 0)
3110
3111#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3112EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3113#endif
3114EMIT_SHL(16, uint16_t, _intel, 1)
3115EMIT_SHL(16, uint16_t, _amd, 0)
3116
3117#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3118EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3119#endif
3120EMIT_SHL(8, uint8_t, _intel, 1)
3121EMIT_SHL(8, uint8_t, _amd, 0)
3122
3123
3124/*
3125 * SHR
3126 */
3127#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3128IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3129{ \
3130 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3131 if (cShift) \
3132 { \
3133 a_uType const uDst = *puDst; \
3134 a_uType uResult = uDst >> cShift; \
3135 *puDst = uResult; \
3136 \
3137 /* Calc EFLAGS. */ \
3138 AssertCompile(X86_EFL_CF_BIT == 0); \
3139 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3140 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3141 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what Intel documents; Intel 10980XE does this for all shift counts. */ \
3142 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3143 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3144 fEfl |= X86_EFL_CALC_ZF(uResult); \
3145 fEfl |= g_afParity[uResult & 0xff]; \
3146 if (!a_fIntelFlags) \
3147 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3148 *pfEFlags = fEfl; \
3149 } \
3150}
3151
3152#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3153EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3154#endif
3155EMIT_SHR(64, uint64_t, _intel, 1)
3156EMIT_SHR(64, uint64_t, _amd, 0)
3157
3158#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3159EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3160#endif
3161EMIT_SHR(32, uint32_t, _intel, 1)
3162EMIT_SHR(32, uint32_t, _amd, 0)
3163
3164#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3165EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3166#endif
3167EMIT_SHR(16, uint16_t, _intel, 1)
3168EMIT_SHR(16, uint16_t, _amd, 0)
3169
3170#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3171EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3172#endif
3173EMIT_SHR(8, uint8_t, _intel, 1)
3174EMIT_SHR(8, uint8_t, _amd, 0)
3175
3176
3177/*
3178 * SAR
3179 */
3180#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3181IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3182{ \
3183 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3184 if (cShift) \
3185 { \
3186 a_iType const iDst = (a_iType)*puDst; \
3187 a_uType uResult = iDst >> cShift; \
3188 *puDst = uResult; \
3189 \
3190 /* Calc EFLAGS. \
3191 Note! The OF flag is always zero because the sign of the result never differs from the sign of the input. */ \
3192 AssertCompile(X86_EFL_CF_BIT == 0); \
3193 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3194 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3195 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3196 fEfl |= X86_EFL_CALC_ZF(uResult); \
3197 fEfl |= g_afParity[uResult & 0xff]; \
3198 if (!a_fIntelFlags) \
3199 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
3200 *pfEFlags = fEfl; \
3201 } \
3202}
3203
3204#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3205EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3206#endif
3207EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3208EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3209
3210#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3211EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3212#endif
3213EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3214EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3215
3216#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3217EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3218#endif
3219EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3220EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3221
3222#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3223EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3224#endif
3225EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3226EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3227
3228
3229/*
3230 * SHLD
3231 *
3232 * - CF is the last bit shifted out of puDst.
3233 * - AF is always cleared by Intel 10980XE.
3234 * - AF is always set by AMD 3990X.
3235 * - OF is set according to the first shift on Intel 10980XE, it seems.
3236 * - OF is set according to the last sub-shift on AMD 3990X.
3237 * - ZF, SF and PF are calculated according to the result by both vendors.
3238 *
3239 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3240 * pick either the source register or the destination register for input bits
3241 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3242 * Intel has changed behaviour here several times. We implement what current
3243 * Skylake-based CPUs do for now; we can extend this later as needed.
3244 */
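/*
 * Informal sketch (kept out of the build): the wide-temporary composition the
 * 16-bit variant further below uses so that counts above 16 pull bits back in
 * from the chosen register, Intel style (uDst:uSrc:uDst).  The helper name and
 * the operand values are made up.
 */
#if 0
static uint16_t DemoShld16Intel(uint16_t uDst, uint16_t uSrc, uint8_t cShift)
{
    uint64_t const uTmp = ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst;
    cShift &= 31;
    /* DemoShld16Intel(0x1234, 0xabcd, 20) -> 0xbcd1: the first 16 sub-shifts
       leave uSrc in the destination, the remaining 4 refill from uDst again. */
    return (uint16_t)((uTmp << cShift) >> 32);
}
#endif
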
3245#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3246IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3247 uint32_t *pfEFlags)) \
3248{ \
3249 cShift &= a_cBitsWidth - 1; \
3250 if (cShift) \
3251 { \
3252 a_uType const uDst = *puDst; \
3253 a_uType uResult = uDst << cShift; \
3254 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3255 *puDst = uResult; \
3256 \
3257 /* CALC EFLAGS: */ \
3258 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3259 if (a_fIntelFlags) \
3260 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3261 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3262 else \
3263 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3264 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3265 fEfl |= X86_EFL_AF; \
3266 } \
3267 AssertCompile(X86_EFL_CF_BIT == 0); \
3268 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3269 fEfl |= g_afParity[uResult & 0xff]; \
3270 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3271 fEfl |= X86_EFL_CALC_ZF(uResult); \
3272 *pfEFlags = fEfl; \
3273 } \
3274}
3275
3276#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3277EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3278#endif
3279EMIT_SHLD(64, uint64_t, _intel, 1)
3280EMIT_SHLD(64, uint64_t, _amd, 0)
3281
3282#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3283EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3284#endif
3285EMIT_SHLD(32, uint32_t, _intel, 1)
3286EMIT_SHLD(32, uint32_t, _amd, 0)
3287
3288#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3289IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3290{ \
3291 cShift &= 31; \
3292 if (cShift) \
3293 { \
3294 uint16_t const uDst = *puDst; \
3295 uint64_t const uTmp = a_fIntelFlags \
3296 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3297 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3298 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3299 *puDst = uResult; \
3300 \
3301 /* CALC EFLAGS: */ \
3302 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3303 AssertCompile(X86_EFL_CF_BIT == 0); \
3304 if (a_fIntelFlags) \
3305 { \
3306 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3307 /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
3308 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3309 } \
3310 else \
3311 { \
3312 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3313 if (cShift < 16) \
3314 { \
3315 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3316 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3317 } \
3318 else \
3319 { \
3320 if (cShift == 16) \
3321 fEfl |= uDst & X86_EFL_CF; \
3322 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3323 } \
3324 fEfl |= X86_EFL_AF; \
3325 } \
3326 fEfl |= g_afParity[uResult & 0xff]; \
3327 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3328 fEfl |= X86_EFL_CALC_ZF(uResult); \
3329 *pfEFlags = fEfl; \
3330 } \
3331}
3332
3333#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3334EMIT_SHLD_16(RT_NOTHING, 1)
3335#endif
3336EMIT_SHLD_16(_intel, 1)
3337EMIT_SHLD_16(_amd, 0)
3338
3339
3340/*
3341 * SHRD
3342 *
3343 * EFLAGS behaviour seems to be the same as with SHLD:
3344 * - CF is the last bit shifted out of puDst.
3345 * - AF is always cleared by Intel 10980XE.
3346 * - AF is always set by AMD 3990X.
3347 * - OF is set according to the first shift on Intel 10980XE, it seems.
3348 * - OF is set according to the last sub-shift on AMD 3990X.
3349 * - ZF, SF and PF are calculated according to the result by both vendors.
3350 *
3351 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3352 * pick either the source register or the destination register for input bits
3353 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3354 * Intel has changed behaviour here several times. We implement what current
3355 * Skylake-based CPUs do for now; we can extend this later as needed.
3356 */
3357#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3358IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3359{ \
3360 cShift &= a_cBitsWidth - 1; \
3361 if (cShift) \
3362 { \
3363 a_uType const uDst = *puDst; \
3364 a_uType uResult = uDst >> cShift; \
3365 uResult |= uSrc << (a_cBitsWidth - cShift); \
3366 *puDst = uResult; \
3367 \
3368 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3369 AssertCompile(X86_EFL_CF_BIT == 0); \
3370 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3371 if (a_fIntelFlags) \
3372 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3373 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3374 else \
3375 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3376 if (cShift > 1) /* Set according to last shift. */ \
3377 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3378 else \
3379 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3380 fEfl |= X86_EFL_AF; \
3381 } \
3382 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3383 fEfl |= X86_EFL_CALC_ZF(uResult); \
3384 fEfl |= g_afParity[uResult & 0xff]; \
3385 *pfEFlags = fEfl; \
3386 } \
3387}
3388
3389#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3390EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3391#endif
3392EMIT_SHRD(64, uint64_t, _intel, 1)
3393EMIT_SHRD(64, uint64_t, _amd, 0)
3394
3395#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3396EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3397#endif
3398EMIT_SHRD(32, uint32_t, _intel, 1)
3399EMIT_SHRD(32, uint32_t, _amd, 0)
3400
3401#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3402IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3403{ \
3404 cShift &= 31; \
3405 if (cShift) \
3406 { \
3407 uint16_t const uDst = *puDst; \
3408 uint64_t const uTmp = a_fIntelFlags \
3409 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3410 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3411 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3412 *puDst = uResult; \
3413 \
3414 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3415 AssertCompile(X86_EFL_CF_BIT == 0); \
3416 if (a_fIntelFlags) \
3417 { \
3418 /* Intel 10980XE: The CF is the last bit shifted out of the combined uTmp operand. */ \
3419 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3420 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3421 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3422 } \
3423 else \
3424 { \
3425 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3426 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3427 /* AMD 3990X: Set according to last shift. AF always set. */ \
3428 if (cShift > 1) /* Set according to last shift. */ \
3429 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3430 else \
3431 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3432 fEfl |= X86_EFL_AF; \
3433 } \
3434 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3435 fEfl |= X86_EFL_CALC_ZF(uResult); \
3436 fEfl |= g_afParity[uResult & 0xff]; \
3437 *pfEFlags = fEfl; \
3438 } \
3439}
3440
3441#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3442EMIT_SHRD_16(RT_NOTHING, 1)
3443#endif
3444EMIT_SHRD_16(_intel, 1)
3445EMIT_SHRD_16(_amd, 0)
3446
3447
3448/*
3449 * RORX (BMI2)
3450 */
3451#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3452IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3453{ \
3454 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3455}
3456
3457#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3458EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3459#endif
3460#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3461EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3462#endif
3463
3464
3465/*
3466 * SHLX (BMI2)
3467 */
3468#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3469IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3470{ \
3471 cShift &= a_cBitsWidth - 1; \
3472 *puDst = uSrc << cShift; \
3473}
3474
3475#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3476EMIT_SHLX(64, uint64_t, RT_NOTHING)
3477EMIT_SHLX(64, uint64_t, _fallback)
3478#endif
3479#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3480EMIT_SHLX(32, uint32_t, RT_NOTHING)
3481EMIT_SHLX(32, uint32_t, _fallback)
3482#endif
3483
3484
3485/*
3486 * SHRX (BMI2)
3487 */
3488#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3489IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3490{ \
3491 cShift &= a_cBitsWidth - 1; \
3492 *puDst = uSrc >> cShift; \
3493}
3494
3495#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3496EMIT_SHRX(64, uint64_t, RT_NOTHING)
3497EMIT_SHRX(64, uint64_t, _fallback)
3498#endif
3499#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3500EMIT_SHRX(32, uint32_t, RT_NOTHING)
3501EMIT_SHRX(32, uint32_t, _fallback)
3502#endif
3503
3504
3505/*
3506 * SARX (BMI2)
3507 */
3508#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3509IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3510{ \
3511 cShift &= a_cBitsWidth - 1; \
3512 *puDst = (a_iType)uSrc >> cShift; \
3513}
3514
3515#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3516EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3517EMIT_SARX(64, uint64_t, int64_t, _fallback)
3518#endif
3519#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3520EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3521EMIT_SARX(32, uint32_t, int32_t, _fallback)
3522#endif
3523
3524
3525/*
3526 * PDEP (BMI2)
3527 */
3528#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3529IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3530{ \
3531 a_uType uResult = 0; \
3532 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3533 if (fMask & ((a_uType)1 << iMaskBit)) \
3534 { \
3535 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3536 iBit++; \
3537 } \
3538 *puDst = uResult; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_PDEP(64, uint64_t, RT_NOTHING)
3543#endif
3544EMIT_PDEP(64, uint64_t, _fallback)
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_PDEP(32, uint32_t, RT_NOTHING)
3547#endif
3548EMIT_PDEP(32, uint32_t, _fallback)
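
/*
 * Quick sanity sketch for the deposit loop above (illustrative only, not
 * compiled): the low bits of uSrc are scattered, in order, into the set bit
 * positions of fMask; all other result bits end up zero.  The helper name and
 * the expected values below were worked out by hand.
 */
#if 0
static void DemoPdep32(void)
{
    uint32_t uDst = 0;
    iemAImpl_pdep_u32_fallback(&uDst, 0x0000000f /*uSrc*/, 0x00000055 /*fMask*/);
    Assert(uDst == 0x00000055); /* source bits 0..3 land in mask positions 0, 2, 4 and 6 */
    iemAImpl_pdep_u32_fallback(&uDst, 0x000000ab /*uSrc*/, 0x0000ff00 /*fMask*/);
    Assert(uDst == 0x0000ab00); /* a whole byte deposited into bits 8..15 */
}
#endif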
3549
3550/*
3551 * PEXT (BMI2)
3552 */
3553#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3554IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3555{ \
3556 a_uType uResult = 0; \
3557 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3558 if (fMask & ((a_uType)1 << iMaskBit)) \
3559 { \
3560 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3561 iBit++; \
3562 } \
3563 *puDst = uResult; \
3564}
3565
3566#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3567EMIT_PEXT(64, uint64_t, RT_NOTHING)
3568#endif
3569EMIT_PEXT(64, uint64_t, _fallback)
3570#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3571EMIT_PEXT(32, uint32_t, RT_NOTHING)
3572#endif
3573EMIT_PEXT(32, uint32_t, _fallback)
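
/*
 * Quick sanity sketch for the extract loop above (illustrative only, not
 * compiled): the source bits selected by fMask are gathered into the low bits
 * of the result, preserving their order.  The helper name and the expected
 * value below were worked out by hand.
 */
#if 0
static void DemoPext32(void)
{
    uint32_t uDst = 0;
    iemAImpl_pext_u32_fallback(&uDst, 0x12345678 /*uSrc*/, 0x0000ff00 /*fMask*/);
    Assert(uDst == 0x00000056); /* bits 8..15 of the source packed down into bits 0..7 */
}
#endif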
3574
3575
3576#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3577
3578# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3579/*
3580 * BSWAP
3581 */
3582
3583IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3584{
3585 *puDst = ASMByteSwapU64(*puDst);
3586}
3587
3588
3589IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3590{
3591 *puDst = ASMByteSwapU32(*puDst);
3592}
3593
3594
3595/* Note! Undocumented, hence the 32-bit argument. */
3596IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3597{
3598#if 0
3599 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3600#else
3601 /* This is the behaviour of the AMD 3990X (64-bit mode): */
3602 *(uint16_t *)puDst = 0;
3603#endif
3604}
3605
3606# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3607
3608
3609
3610# if defined(IEM_WITHOUT_ASSEMBLY)
3611
3612/*
3613 * LFENCE, SFENCE & MFENCE.
3614 */
3615
3616IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3617{
3618 ASMReadFence();
3619}
3620
3621
3622IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3623{
3624 ASMWriteFence();
3625}
3626
3627
3628IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3629{
3630 ASMMemoryFence();
3631}
3632
3633
3634# ifndef RT_ARCH_ARM64
3635IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3636{
3637 ASMMemoryFence();
3638}
3639# endif
3640
3641# endif
3642
3643#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3644
3645
3646IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3647{
3648 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3649 {
3650 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3651 *pu16Dst |= u16Src & X86_SEL_RPL;
3652
3653 *pfEFlags |= X86_EFL_ZF;
3654 }
3655 else
3656 *pfEFlags &= ~X86_EFL_ZF;
3657}
3658
3659
3660#if defined(IEM_WITHOUT_ASSEMBLY)
3661
3662/*********************************************************************************************************************************
3663* x87 FPU Loads *
3664*********************************************************************************************************************************/
3665
3666IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3667{
3668 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3669 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3670 {
3671 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3672 pFpuRes->r80Result.sj64.fInteger = 1;
3673 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3674 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3675 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3676 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3677 }
3678 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3679 {
3680 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3681 pFpuRes->r80Result.s.uExponent = 0;
3682 pFpuRes->r80Result.s.uMantissa = 0;
3683 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3684 }
3685 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3686 {
3687 /* Subnormal values get normalized. */
3688 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3689 pFpuRes->r80Result.sj64.fInteger = 1;
3690 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3691 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3692 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3693 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3694 pFpuRes->FSW |= X86_FSW_DE;
3695 if (!(pFpuState->FCW & X86_FCW_DM))
3696 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3697 }
3698 else if (RTFLOAT32U_IS_INF(pr32Val))
3699 {
3700 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3701 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3702 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3703 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3704 }
3705 else
3706 {
3707 /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3708 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3709 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3710 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3711 pFpuRes->r80Result.sj64.fInteger = 1;
3712 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3713 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3714 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3715 {
3716 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3717 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3718 pFpuRes->FSW |= X86_FSW_IE;
3719
3720 if (!(pFpuState->FCW & X86_FCW_IM))
3721 {
3722 /* The value is not pushed. */
3723 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3724 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3725 pFpuRes->r80Result.au64[0] = 0;
3726 pFpuRes->r80Result.au16[4] = 0;
3727 }
3728 }
3729 else
3730 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3731 }
3732}
3733
3734
3735IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3736{
3737 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3738 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3739 {
3740 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3741 pFpuRes->r80Result.sj64.fInteger = 1;
3742 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3743 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3744 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3745 }
3746 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3747 {
3748 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3749 pFpuRes->r80Result.s.uExponent = 0;
3750 pFpuRes->r80Result.s.uMantissa = 0;
3751 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3752 }
3753 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3754 {
3755 /* Subnormal values get normalized. */
3756 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3757 pFpuRes->r80Result.sj64.fInteger = 1;
3758 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3759 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3760 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3761 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3762 pFpuRes->FSW |= X86_FSW_DE;
3763 if (!(pFpuState->FCW & X86_FCW_DM))
3764 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3765 }
3766 else if (RTFLOAT64U_IS_INF(pr64Val))
3767 {
3768 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3769 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3770 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3771 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3772 }
3773 else
3774 {
3775 /* Signalling and quiet NaNs both turn into quiet ones when loaded (weird). */
3776 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3777 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3778 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3779 pFpuRes->r80Result.sj64.fInteger = 1;
3780 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3781 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3782 {
3783 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3784 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3785 pFpuRes->FSW |= X86_FSW_IE;
3786
3787 if (!(pFpuState->FCW & X86_FCW_IM))
3788 {
3789 /* The value is not pushed. */
3790 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3791 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3792 pFpuRes->r80Result.au64[0] = 0;
3793 pFpuRes->r80Result.au16[4] = 0;
3794 }
3795 }
3796 else
3797 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3798 }
3799}
3800
3801
3802IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3803{
3804 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3805 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3806 /* Raises no exceptions. */
3807 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3808}
3809
3810
3811IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3812{
3813 pFpuRes->r80Result.sj64.fSign = 0;
3814 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3815 pFpuRes->r80Result.sj64.fInteger = 1;
3816 pFpuRes->r80Result.sj64.uFraction = 0;
3817
3818 /*
3819 * FPU status word:
3820 * - TOP is irrelevant, but we must match x86 assembly version.
3821 * - C1 is always cleared as we don't have any stack overflows.
3822 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3823 */
3824 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3829{
3830 pFpuRes->r80Result.sj64.fSign = 0;
3831 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3832 pFpuRes->r80Result.sj64.fInteger = 1;
3833 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3834 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3835 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3836 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3837}
3838
3839
3840IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3841{
3842 pFpuRes->r80Result.sj64.fSign = 0;
3843 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3844 pFpuRes->r80Result.sj64.fInteger = 1;
3845 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3846 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3847 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3848}
3849
3850
3851IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3852{
3853 pFpuRes->r80Result.sj64.fSign = 0;
3854 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3855 pFpuRes->r80Result.sj64.fInteger = 1;
3856 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3857 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3858 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3859 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3860}
3861
3862
3863IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3864{
3865 pFpuRes->r80Result.sj64.fSign = 0;
3866 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3867 pFpuRes->r80Result.sj64.fInteger = 1;
3868 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3869 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3870 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3871 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3872}
3873
3874
3875IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3876{
3877 pFpuRes->r80Result.sj64.fSign = 0;
3878 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3879 pFpuRes->r80Result.sj64.fInteger = 1;
3880 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3881 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3882 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3883 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3884}
3885
3886
3887IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3888{
3889 pFpuRes->r80Result.s.fSign = 0;
3890 pFpuRes->r80Result.s.uExponent = 0;
3891 pFpuRes->r80Result.s.uMantissa = 0;
3892 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3893}
3894
3895#define EMIT_FILD(a_cBits) \
3896IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3897 int ## a_cBits ## _t const *piVal)) \
3898{ \
3899 int ## a_cBits ## _t iVal = *piVal; \
3900 if (iVal == 0) \
3901 { \
3902 pFpuRes->r80Result.s.fSign = 0; \
3903 pFpuRes->r80Result.s.uExponent = 0; \
3904 pFpuRes->r80Result.s.uMantissa = 0; \
3905 } \
3906 else \
3907 { \
3908 if (iVal > 0) \
3909 pFpuRes->r80Result.s.fSign = 0; \
3910 else \
3911 { \
3912 pFpuRes->r80Result.s.fSign = 1; \
3913 iVal = -iVal; \
3914 } \
3915 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3916 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3917 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3918 } \
3919 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3920}
3921EMIT_FILD(16)
3922EMIT_FILD(32)
3923EMIT_FILD(64)
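
/*
 * Worked example (informal): loading the integer 5 through the 16-bit variant
 * above.  5 = 101b, so ASMBitLastSetU16 returns 3, giving
 * uExponent = 3 - 1 + RTFLOAT80U_EXP_BIAS and
 * uMantissa = 5 << (64 - 3) = 0xa000000000000000, i.e. 1.01b * 2^2 = 5.
 */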
3924
3925
3926IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3927{
3928 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3929 if ( pd80Val->s.abPairs[0] == 0
3930 && pd80Val->s.abPairs[1] == 0
3931 && pd80Val->s.abPairs[2] == 0
3932 && pd80Val->s.abPairs[3] == 0
3933 && pd80Val->s.abPairs[4] == 0
3934 && pd80Val->s.abPairs[5] == 0
3935 && pd80Val->s.abPairs[6] == 0
3936 && pd80Val->s.abPairs[7] == 0
3937 && pd80Val->s.abPairs[8] == 0)
3938 {
3939 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3940 pFpuRes->r80Result.s.uExponent = 0;
3941 pFpuRes->r80Result.s.uMantissa = 0;
3942 }
3943 else
3944 {
3945 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3946
3947 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3948 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3949 cPairs--;
3950
3951 uint64_t uVal = 0;
3952 uint64_t uFactor = 1;
3953 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3954 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3955 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3956
3957 unsigned const cBits = ASMBitLastSetU64(uVal);
3958 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3959 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3960 }
3961}
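
/*
 * Worked example (informal): a packed BCD operand with abPairs[] = {0x45, 0x23,
 * 0x01, 0, ...} encodes decimal 12345 (the low nibble of the lowest pair is the
 * least significant digit).  The loop above accumulates uVal = 12345, which is
 * then normalized just like FILD: 14 significant bits, so the exponent field
 * becomes 13 + RTFLOAT80U_EXP_BIAS and uMantissa = 12345 << 50.
 */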
3962
3963
3964/*********************************************************************************************************************************
3965* x87 FPU Stores *
3966*********************************************************************************************************************************/
3967
3968/**
3969 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
3970 *
3971 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
3972 *
3973 * @returns Updated FPU status word value.
3974 * @param fSignIn Incoming sign indicator.
3975 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
3976 * @param iExponentIn Unbiased exponent.
3977 * @param fFcw The FPU control word.
3978 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
3979 * @param pr32Dst Where to return the output value, if one should be
3980 * returned.
3981 *
3982 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
3983 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
3984 */
3985static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
3986 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
3987{
3988 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0xffffffffff */
3989 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3990 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x8000000000 */
3991 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
3992 ? fRoundingOffMask
3993 : 0;
3994 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
3995
3996 /*
3997 * Deal with potential overflows/underflows first, optimizing for none.
3998 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
3999 */
4000 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4001 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4002 { /* likely? */ }
4003 /*
4004 * Underflow if the exponent is zero or negative. We attempt to map this
4005 * to a subnormal number when possible, with some additional trickery of course.
4006 */
4007 else if (iExponentOut <= 0)
4008 {
4009 bool const fIsTiny = iExponentOut < 0
4010 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4011 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4012 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4013 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4014
4015 if (iExponentOut <= 0)
4016 {
4017 uMantissaIn = iExponentOut <= -63
4018 ? uMantissaIn != 0
4019 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4020 fRoundedOff = uMantissaIn & fRoundingOffMask;
4021 if (fRoundedOff && fIsTiny)
4022 fFsw |= X86_FSW_UE;
4023 iExponentOut = 0;
4024 }
4025 }
4026 /*
4027 * Overflow if at or above max exponent value or if we will reach max
4028 * when rounding. Will return +/-zero or +/-max value depending on
4029 * whether we're rounding or not.
4030 */
4031 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4032 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4033 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4034 {
4035 fFsw |= X86_FSW_OE;
4036 if (!(fFcw & X86_FCW_OM))
4037 return fFsw | X86_FSW_ES | X86_FSW_B;
4038 fFsw |= X86_FSW_PE;
4039 if (uRoundingAdd)
4040 fFsw |= X86_FSW_C1;
4041 if (!(fFcw & X86_FCW_PM))
4042 fFsw |= X86_FSW_ES | X86_FSW_B;
4043
4044 pr32Dst->s.fSign = fSignIn;
4045 if (uRoundingAdd)
4046 { /* Zero */
4047 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4048 pr32Dst->s.uFraction = 0;
4049 }
4050 else
4051 { /* Max */
4052 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4053 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4054 }
4055 return fFsw;
4056 }
4057
4058 /*
4059 * Normal or subnormal number.
4060 */
4061 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4062 uint64_t uMantissaOut = uMantissaIn;
4063 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4064 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4065 || fRoundedOff != uRoundingAdd)
4066 {
4067 uMantissaOut = uMantissaIn + uRoundingAdd;
4068 if (uMantissaOut >= uMantissaIn)
4069 { /* likely */ }
4070 else
4071 {
4072 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4073 iExponentOut++;
4074 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4075 fFsw |= X86_FSW_C1;
4076 }
4077 }
4078 else
4079 uMantissaOut = uMantissaIn;
4080
4081 /* Truncate the mantissa and set the return value. */
4082 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4083
4084 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4085 pr32Dst->s.uExponent = iExponentOut;
4086 pr32Dst->s.fSign = fSignIn;
4087
4088 /* Set status flags related to rounding. */
4089 if (fRoundedOff)
4090 {
4091 fFsw |= X86_FSW_PE;
4092 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4093 fFsw |= X86_FSW_C1;
4094 if (!(fFcw & X86_FCW_PM))
4095 fFsw |= X86_FSW_ES | X86_FSW_B;
4096 }
4097
4098 return fFsw;
4099}
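
/*
 * Worked example (informal, made-up value): storing 16777217.0 (2^24 + 1, i.e.
 * mantissa 0x8000008000000000 with an unbiased exponent of 24) using the helper
 * above with round-to-nearest.  The 40 bits rounded off equal exactly half a
 * unit in the last place and mantissa bit 40 is clear, so the value is
 * truncated (round half to even), yielding 16777216.0f with X86_FSW_PE set and
 * C1 clear.
 */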
4100
4101
4102/**
4103 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4104 */
4105IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4106 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4107{
4108 uint16_t const fFcw = pFpuState->FCW;
4109 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4110 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4111 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4112 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4113 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4114 {
4115 pr32Dst->s.fSign = pr80Src->s.fSign;
4116 pr32Dst->s.uExponent = 0;
4117 pr32Dst->s.uFraction = 0;
4118 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4119 }
4120 else if (RTFLOAT80U_IS_INF(pr80Src))
4121 {
4122 pr32Dst->s.fSign = pr80Src->s.fSign;
4123 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4124 pr32Dst->s.uFraction = 0;
4125 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4126 }
4127 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4128 {
4129 /* Mapped to +/-QNaN */
4130 pr32Dst->s.fSign = pr80Src->s.fSign;
4131 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4132 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4133 }
4134 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4135 {
4136 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4137 if (fFcw & X86_FCW_IM)
4138 {
4139 pr32Dst->s.fSign = 1;
4140 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4141 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4142 fFsw |= X86_FSW_IE;
4143 }
4144 else
4145 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4146 }
4147 else if (RTFLOAT80U_IS_NAN(pr80Src))
4148 {
4149 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4150 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4151 {
4152 pr32Dst->s.fSign = pr80Src->s.fSign;
4153 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4154 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4155 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4156 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4157 fFsw |= X86_FSW_IE;
4158 }
4159 else
4160 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4161 }
4162 else
4163 {
4164 /* Denormal values cause both an underflow and a precision exception. */
4165 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4166 if (fFcw & X86_FCW_UM)
4167 {
4168 pr32Dst->s.fSign = pr80Src->s.fSign;
4169 pr32Dst->s.uExponent = 0;
4170 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4171 {
4172 pr32Dst->s.uFraction = 1;
4173 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4174 if (!(fFcw & X86_FCW_PM))
4175 fFsw |= X86_FSW_ES | X86_FSW_B;
4176 }
4177 else
4178 {
4179 pr32Dst->s.uFraction = 0;
4180 fFsw |= X86_FSW_UE | X86_FSW_PE;
4181 if (!(fFcw & X86_FCW_PM))
4182 fFsw |= X86_FSW_ES | X86_FSW_B;
4183 }
4184 }
4185 else
4186 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 *pu16FSW = fFsw;
4189}
4190
4191
4192/**
4193 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4194 *
4195 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
4196 *
4197 * @returns Updated FPU status word value.
4198 * @param fSignIn Incoming sign indicator.
4199 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4200 * @param iExponentIn Unbiased exponent.
4201 * @param fFcw The FPU control word.
4202 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4203 * @param pr64Dst Where to return the output value, if one should be
4204 * returned.
4205 *
4206 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4207 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4208 */
4209static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4210 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4211{
4212 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
4213 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4214 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4215 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4216 ? fRoundingOffMask
4217 : 0;
4218 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4219
4220 /*
4221 * Deal with potential overflows/underflows first, optimizing for none.
4222 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
4223 */
4224 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4225 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4226 { /* likely? */ }
4227 /*
4228 * Underflow if the exponent is zero or negative. We attempt to map this
4229 * to a subnormal number when possible, with some additional trickery of course.
4230 */
4231 else if (iExponentOut <= 0)
4232 {
4233 bool const fIsTiny = iExponentOut < 0
4234 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4235 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4236 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4237 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4238
4239 if (iExponentOut <= 0)
4240 {
4241 uMantissaIn = iExponentOut <= -63
4242 ? uMantissaIn != 0
4243 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4244 fRoundedOff = uMantissaIn & fRoundingOffMask;
4245 if (fRoundedOff && fIsTiny)
4246 fFsw |= X86_FSW_UE;
4247 iExponentOut = 0;
4248 }
4249 }
4250 /*
4251 * Overflow if at or above max exponent value or if we will reach max
4252 * when rounding. Will return +/-zero or +/-max value depending on
4253 * whether we're rounding or not.
4254 */
4255 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4256 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4257 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4258 {
4259 fFsw |= X86_FSW_OE;
4260 if (!(fFcw & X86_FCW_OM))
4261 return fFsw | X86_FSW_ES | X86_FSW_B;
4262 fFsw |= X86_FSW_PE;
4263 if (uRoundingAdd)
4264 fFsw |= X86_FSW_C1;
4265 if (!(fFcw & X86_FCW_PM))
4266 fFsw |= X86_FSW_ES | X86_FSW_B;
4267
4268 pr64Dst->s64.fSign = fSignIn;
4269 if (uRoundingAdd)
4270 { /* Zero */
4271 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4272 pr64Dst->s64.uFraction = 0;
4273 }
4274 else
4275 { /* Max */
4276 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4277 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4278 }
4279 return fFsw;
4280 }
4281
4282 /*
4283 * Normal or subnormal number.
4284 */
4285 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4286 uint64_t uMantissaOut = uMantissaIn;
4287 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4288 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4289 || fRoundedOff != uRoundingAdd)
4290 {
4291 uMantissaOut = uMantissaIn + uRoundingAdd;
4292 if (uMantissaOut >= uMantissaIn)
4293 { /* likely */ }
4294 else
4295 {
4296 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4297 iExponentOut++;
4298 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4299 fFsw |= X86_FSW_C1;
4300 }
4301 }
4302 else
4303 uMantissaOut = uMantissaIn;
4304
4305 /* Truncate the mantissa and set the return value. */
4306 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4307
4308 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4309 pr64Dst->s64.uExponent = iExponentOut;
4310 pr64Dst->s64.fSign = fSignIn;
4311
4312 /* Set status flags related to rounding. */
4313 if (fRoundedOff)
4314 {
4315 fFsw |= X86_FSW_PE;
4316 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4317 fFsw |= X86_FSW_C1;
4318 if (!(fFcw & X86_FCW_PM))
4319 fFsw |= X86_FSW_ES | X86_FSW_B;
4320 }
4321
4322 return fFsw;
4323}
4324
4325
4326/**
4327 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4328 */
4329IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4330 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4331{
4332 uint16_t const fFcw = pFpuState->FCW;
4333 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4334 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4335 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4336 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4337 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4338 {
4339 pr64Dst->s64.fSign = pr80Src->s.fSign;
4340 pr64Dst->s64.uExponent = 0;
4341 pr64Dst->s64.uFraction = 0;
4342 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4343 }
4344 else if (RTFLOAT80U_IS_INF(pr80Src))
4345 {
4346 pr64Dst->s64.fSign = pr80Src->s.fSign;
4347 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4348 pr64Dst->s64.uFraction = 0;
4349 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4350 }
4351 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4352 {
4353 /* Mapped to +/-QNaN */
4354 pr64Dst->s64.fSign = pr80Src->s.fSign;
4355 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4356 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4357 }
4358 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4359 {
4360 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4361 if (fFcw & X86_FCW_IM)
4362 {
4363 pr64Dst->s64.fSign = 1;
4364 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4365 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4366 fFsw |= X86_FSW_IE;
4367 }
4368 else
4369 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4370 }
4371 else if (RTFLOAT80U_IS_NAN(pr80Src))
4372 {
4373 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4374 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4375 {
4376 pr64Dst->s64.fSign = pr80Src->s.fSign;
4377 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4378 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4379 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4380 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4381 fFsw |= X86_FSW_IE;
4382 }
4383 else
4384 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4385 }
4386 else
4387 {
4388 /* Denormal values cause both an underflow and a precision exception. */
4389 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4390 if (fFcw & X86_FCW_UM)
4391 {
4392 pr64Dst->s64.fSign = pr80Src->s.fSign;
4393 pr64Dst->s64.uExponent = 0;
4394 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4395 {
4396 pr64Dst->s64.uFraction = 1;
4397 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4398 if (!(fFcw & X86_FCW_PM))
4399 fFsw |= X86_FSW_ES | X86_FSW_B;
4400 }
4401 else
4402 {
4403 pr64Dst->s64.uFraction = 0;
4404 fFsw |= X86_FSW_UE | X86_FSW_PE;
4405 if (!(fFcw & X86_FCW_PM))
4406 fFsw |= X86_FSW_ES | X86_FSW_B;
4407 }
4408 }
4409 else
4410 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 *pu16FSW = fFsw;
4413}
4414
4415
4416IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4417 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4418{
4419 /*
4420 * FPU status word:
4421 * - TOP is irrelevant, but we must match x86 assembly version (0).
4422 * - C1 is always cleared as we don't have any stack overflows.
4423 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4424 */
4425 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4426 *pr80Dst = *pr80Src;
4427}
4428
4429
4430/*
4431 *
4432 * Mantissa:
4433 *  63         56        48        40        32        24        16        8         0
4434 *  v          v         v         v         v         v         v         v         v
4435 *  1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4436 *      \    \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
4437 * Exp: 0    4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
4438 *
4439 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4440 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4441 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4442 * where we'll drop off all but bit 63.
4443 */
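/*
 * Worked example (informal, made-up value): FIST of 2.5 (mantissa
 * 0xa000000000000000, unbiased exponent 1) with round-to-nearest.  cShiftOff is
 * 62 and the rounded-off bits equal exactly half a unit in the last place; the
 * initial round-up in the macro below gives 3, which is then adjusted to the
 * even neighbour 2, with X86_FSW_PE set.
 */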
4444#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4445IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4446 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4447{ \
4448 uint16_t const fFcw = pFpuState->FCW; \
4449 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4450 bool const fSignIn = pr80Val->s.fSign; \
4451 \
4452 /* \
4453 * Deal with normal numbers first. \
4454 */ \
4455 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4456 { \
4457 uint64_t uMantissa = pr80Val->s.uMantissa; \
4458 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4459 \
4460 if ((uint32_t)iExponent <= a_cBits - 2) \
4461 { \
4462 unsigned const cShiftOff = 63 - iExponent; \
4463 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4464 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4465 ? RT_BIT_64(cShiftOff - 1) \
4466 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4467 ? fRoundingOffMask \
4468 : 0; \
4469 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4470 \
4471 uMantissa >>= cShiftOff; \
4472 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4473 uMantissa += uRounding; \
4474 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4475 { \
4476 if (fRoundedOff) \
4477 { \
4478 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4479 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4480 else if (uRounding) \
4481 fFsw |= X86_FSW_C1; \
4482 fFsw |= X86_FSW_PE; \
4483 if (!(fFcw & X86_FCW_PM)) \
4484 fFsw |= X86_FSW_ES | X86_FSW_B; \
4485 } \
4486 \
4487 if (!fSignIn) \
4488 *piDst = (a_iType)uMantissa; \
4489 else \
4490 *piDst = -(a_iType)uMantissa; \
4491 } \
4492 else \
4493 { \
4494 /* overflowed after rounding. */ \
4495 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4496 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4497 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4498 \
4499 /* Special case for the integer minimum value. */ \
4500 if (fSignIn) \
4501 { \
4502 *piDst = a_iTypeMin; \
4503 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4504 if (!(fFcw & X86_FCW_PM)) \
4505 fFsw |= X86_FSW_ES | X86_FSW_B; \
4506 } \
4507 else \
4508 { \
4509 fFsw |= X86_FSW_IE; \
4510 if (fFcw & X86_FCW_IM) \
4511 *piDst = a_iTypeMin; \
4512 else \
4513 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4514 } \
4515 } \
4516 } \
4517 /* \
4518 * Tiny sub-zero numbers. \
4519 */ \
4520 else if (iExponent < 0) \
4521 { \
4522 if (!fSignIn) \
4523 { \
4524 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4525 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4526 { \
4527 *piDst = 1; \
4528 fFsw |= X86_FSW_C1; \
4529 } \
4530 else \
4531 *piDst = 0; \
4532 } \
4533 else \
4534 { \
4535 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4536 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4537 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4538 *piDst = 0; \
4539 else \
4540 { \
4541 *piDst = -1; \
4542 fFsw |= X86_FSW_C1; \
4543 } \
4544 } \
4545 fFsw |= X86_FSW_PE; \
4546 if (!(fFcw & X86_FCW_PM)) \
4547 fFsw |= X86_FSW_ES | X86_FSW_B; \
4548 } \
4549 /* \
4550 * Special MIN case. \
4551 */ \
4552 else if ( fSignIn && iExponent == a_cBits - 1 \
4553 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4554 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4555 : uMantissa == RT_BIT_64(63))) \
4556 { \
4557 *piDst = a_iTypeMin; \
4558 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4559 { \
4560 fFsw |= X86_FSW_PE; \
4561 if (!(fFcw & X86_FCW_PM)) \
4562 fFsw |= X86_FSW_ES | X86_FSW_B; \
4563 } \
4564 } \
4565 /* \
4566 * Too large/small number outside the target integer range. \
4567 */ \
4568 else \
4569 { \
4570 fFsw |= X86_FSW_IE; \
4571 if (fFcw & X86_FCW_IM) \
4572 *piDst = a_iTypeIndefinite; \
4573 else \
4574 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4575 } \
4576 } \
4577 /* \
4578 * Map both +0 and -0 to integer zero (signless/+). \
4579 */ \
4580 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4581 *piDst = 0; \
4582 /* \
4583 * Denormals are just really tiny sub-zero numbers that are either rounded \
4584 * to zero, 1 or -1 depending on sign and rounding control. \
4585 */ \
4586 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4587 { \
4588 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4589 *piDst = 0; \
4590 else \
4591 { \
4592 *piDst = fSignIn ? -1 : 1; \
4593 fFsw |= X86_FSW_C1; \
4594 } \
4595 fFsw |= X86_FSW_PE; \
4596 if (!(fFcw & X86_FCW_PM)) \
4597 fFsw |= X86_FSW_ES | X86_FSW_B; \
4598 } \
4599 /* \
4600 * All other special values are considered invalid arguments and result \
4601 * in an IE exception and indefinite value if masked. \
4602 */ \
4603 else \
4604 { \
4605 fFsw |= X86_FSW_IE; \
4606 if (fFcw & X86_FCW_IM) \
4607 *piDst = a_iTypeIndefinite; \
4608 else \
4609 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4610 } \
4611 *pu16FSW = fFsw; \
4612}
4613EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4614EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4615EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4616
4617#endif /*IEM_WITHOUT_ASSEMBLY */
4618
4619
4620/*
4621 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST.
4622 *
4623 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4624 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4625 * thus the @a a_cBitsIn.
4626 */
4627#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4628IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4629 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4630{ \
4631 uint16_t const fFcw = pFpuState->FCW; \
4632 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4633 bool const fSignIn = pr80Val->s.fSign; \
4634 \
4635 /* \
4636 * Deal with normal numbers first. \
4637 */ \
4638 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4639 { \
4640 uint64_t uMantissa = pr80Val->s.uMantissa; \
4641 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4642 \
4643 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4644 { \
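            /* FISTT always truncates toward zero, so just shift off the fraction bits without any rounding add. */ \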
4645 unsigned const cShiftOff = 63 - iExponent; \
4646 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4647 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4648 uMantissa >>= cShiftOff; \
4649 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4650 if (!fSignIn) \
4651 *piDst = (a_iType)uMantissa; \
4652 else \
4653 *piDst = -(a_iType)uMantissa; \
4654 \
4655 if (fRoundedOff) \
4656 { \
4657 fFsw |= X86_FSW_PE; \
4658 if (!(fFcw & X86_FCW_PM)) \
4659 fFsw |= X86_FSW_ES | X86_FSW_B; \
4660 } \
4661 } \
4662 /* \
4663 * Tiny sub-zero numbers. \
4664 */ \
4665 else if (iExponent < 0) \
4666 { \
4667 *piDst = 0; \
4668 fFsw |= X86_FSW_PE; \
4669 if (!(fFcw & X86_FCW_PM)) \
4670 fFsw |= X86_FSW_ES | X86_FSW_B; \
4671 } \
4672 /* \
4673 * Special MIN case. \
4674 */ \
4675 else if ( fSignIn && iExponent == a_cBits - 1 \
4676 && (a_cBits < 64 \
4677 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4678 : uMantissa == RT_BIT_64(63)) ) \
4679 { \
4680 *piDst = a_iTypeMin; \
4681 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689     * Figure out this weirdness (disabled 16-bit special case). \
4690 */ \
4691 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4692 { \
4693 *piDst = 0; \
4694 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4695 { \
4696 fFsw |= X86_FSW_PE; \
4697 if (!(fFcw & X86_FCW_PM)) \
4698 fFsw |= X86_FSW_ES | X86_FSW_B; \
4699 } \
4700 } \
4701 /* \
4702 * Too large/small number outside the target integer range. \
4703 */ \
4704 else \
4705 { \
4706 fFsw |= X86_FSW_IE; \
4707 if (fFcw & X86_FCW_IM) \
4708 *piDst = a_iTypeIndefinite; \
4709 else \
4710 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4711 } \
4712 } \
4713 /* \
4714 * Map both +0 and -0 to integer zero (signless/+). \
4715 */ \
4716 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4717 *piDst = 0; \
4718 /* \
4719     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
4720 */ \
4721 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4722 { \
4723 *piDst = 0; \
4724 fFsw |= X86_FSW_PE; \
4725 if (!(fFcw & X86_FCW_PM)) \
4726 fFsw |= X86_FSW_ES | X86_FSW_B; \
4727 } \
4728 /* \
4729 * All other special values are considered invalid arguments and result \
4730 * in an IE exception and indefinite value if masked. \
4731 */ \
4732 else \
4733 { \
4734 fFsw |= X86_FSW_IE; \
4735 if (fFcw & X86_FCW_IM) \
4736 *piDst = a_iTypeIndefinite; \
4737 else \
4738 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4739 } \
4740 *pu16FSW = fFsw; \
4741}
4742#if defined(IEM_WITHOUT_ASSEMBLY)
4743EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4744EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4745EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4746#endif
4747EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4748EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4749
4750
4751#if defined(IEM_WITHOUT_ASSEMBLY)
4752
4753IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4754 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4755{
4756 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4757 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4758 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4759 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4760 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4761
4762 uint16_t const fFcw = pFpuState->FCW;
4763 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4764 bool const fSignIn = pr80Src->s.fSign;
4765
4766 /*
4767 * Deal with normal numbers first.
4768 */
4769 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4770 {
4771 uint64_t uMantissa = pr80Src->s.uMantissa;
4772 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
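        /* Note: at exponent 59 the shift below drops 4 bits, so the limit is
           (RTPBCD80U_MAX << 4) | 0xf = 0xde0b6b3a763fffff; rounding may still
           overflow the 18 decimal digits, which is handled further down. */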
4773 if ( (uint32_t)iExponent <= 58
4774 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4775 {
4776 unsigned const cShiftOff = 63 - iExponent;
4777 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4778 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4779 ? RT_BIT_64(cShiftOff - 1)
4780 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4781 ? fRoundingOffMask
4782 : 0;
4783 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4784
4785 uMantissa >>= cShiftOff;
4786 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4787 uMantissa += uRounding;
4788 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4789 {
4790 if (fRoundedOff)
4791 {
4792 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4793 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4794 else if (uRounding)
4795 fFsw |= X86_FSW_C1;
4796 fFsw |= X86_FSW_PE;
4797 if (!(fFcw & X86_FCW_PM))
4798 fFsw |= X86_FSW_ES | X86_FSW_B;
4799 }
4800
4801 pd80Dst->s.fSign = fSignIn;
4802 pd80Dst->s.uPad = 0;
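                /* Convert to packed BCD, two decimal digits per byte, least significant pair first. */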
4803 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4804 {
4805 unsigned const uDigits = uMantissa % 100;
4806 uMantissa /= 100;
4807 uint8_t const bLo = uDigits % 10;
4808 uint8_t const bHi = uDigits / 10;
4809 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4810 }
4811 }
4812 else
4813 {
4814 /* overflowed after rounding. */
4815 fFsw |= X86_FSW_IE;
4816 if (fFcw & X86_FCW_IM)
4817 *pd80Dst = s_d80Indefinite;
4818 else
4819 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4820 }
4821 }
4822 /*
4823 * Tiny sub-zero numbers.
4824 */
4825 else if (iExponent < 0)
4826 {
4827 if (!fSignIn)
4828 {
4829 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4830 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4831 {
4832 *pd80Dst = s_ad80One[fSignIn];
4833 fFsw |= X86_FSW_C1;
4834 }
4835 else
4836 *pd80Dst = s_ad80Zeros[fSignIn];
4837 }
4838 else
4839 {
4840 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4841 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4842 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4843 *pd80Dst = s_ad80Zeros[fSignIn];
4844 else
4845 {
4846 *pd80Dst = s_ad80One[fSignIn];
4847 fFsw |= X86_FSW_C1;
4848 }
4849 }
4850 fFsw |= X86_FSW_PE;
4851 if (!(fFcw & X86_FCW_PM))
4852 fFsw |= X86_FSW_ES | X86_FSW_B;
4853 }
4854 /*
4855 * Too large/small number outside the target integer range.
4856 */
4857 else
4858 {
4859 fFsw |= X86_FSW_IE;
4860 if (fFcw & X86_FCW_IM)
4861 *pd80Dst = s_d80Indefinite;
4862 else
4863 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4864 }
4865 }
4866 /*
4867 * Map both +0 and -0 to integer zero (signless/+).
4868 */
4869 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4870 *pd80Dst = s_ad80Zeros[fSignIn];
4871 /*
4872 * Denormals are just really tiny sub-zero numbers that are either rounded
4873 * to zero, 1 or -1 depending on sign and rounding control.
4874 */
4875 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4876 {
4877 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4878 *pd80Dst = s_ad80Zeros[fSignIn];
4879 else
4880 {
4881 *pd80Dst = s_ad80One[fSignIn];
4882 fFsw |= X86_FSW_C1;
4883 }
4884 fFsw |= X86_FSW_PE;
4885 if (!(fFcw & X86_FCW_PM))
4886 fFsw |= X86_FSW_ES | X86_FSW_B;
4887 }
4888 /*
4889 * All other special values are considered invalid arguments and result
4890 * in an IE exception and indefinite value if masked.
4891 */
4892 else
4893 {
4894 fFsw |= X86_FSW_IE;
4895 if (fFcw & X86_FCW_IM)
4896 *pd80Dst = s_d80Indefinite;
4897 else
4898 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4899 }
4900 *pu16FSW = fFsw;
4901}
4902
4903
4904/*********************************************************************************************************************************
4905* FPU Helpers *
4906*********************************************************************************************************************************/
4907AssertCompileSize(RTFLOAT128U, 16);
4908AssertCompileSize(RTFLOAT80U, 10);
4909AssertCompileSize(RTFLOAT64U, 8);
4910AssertCompileSize(RTFLOAT32U, 4);
4911
4912/**
4913 * Normalizes a possible pseudo-denormal value.
4914 *
4915 * Pseudo-denormal values are some oddities from the 8087 & 287 days. They are
4916 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
4917 * i.e. changing uExponent from 0 to 1.
4918 *
4919 * This macro will declare a RTFLOAT80U with the name given by
4920 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
4921 * a normalization was performed.
4922 *
4923 * @note This must be applied before calling SoftFloat with a value that could be
4924 * a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
4925 * correctly.
4926 */
4927#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
4928 RTFLOAT80U a_r80ValNormalized; \
4929 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
4930 { \
4931 a_r80ValNormalized = *a_pr80Val; \
4932 a_r80ValNormalized.s.uExponent = 1; \
4933 a_pr80Val = &a_r80ValNormalized; \
4934 } else do {} while (0)
4935
4936#ifdef IEM_WITH_FLOAT128_FOR_FPU
4937
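/** Sets the host FPU rounding mode to match the x87 rounding control in
 *  @a fFcw, returning the previous host mode for iemFpuF128RestoreRounding. */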
4938DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4939{
4940 int fNew;
4941 switch (fFcw & X86_FCW_RC_MASK)
4942 {
4943 default:
4944 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4945 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4946 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4947 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4948 }
4949 int fOld = fegetround();
4950 fesetround(fNew);
4951 return fOld;
4952}
4953
4954
4955DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4956{
4957 fesetround(fOld);
4958}
4959
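/** Converts an 80-bit FPU value to the host _Float128 type, normalizing
 *  pseudo-denormals in the process. */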
4960DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4961{
4962 RT_NOREF(fFcw);
4963 RTFLOAT128U Tmp;
4964 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4965 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4966 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4967 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4968 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4969 {
4970 Assert(Tmp.s.uExponent == 0);
4971 Tmp.s2.uSignAndExponent++;
4972 }
4973 return *(_Float128 *)&Tmp;
4974}
4975
4976
4977DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
4978{
4979 RT_NOREF(fFcw);
4980 RTFLOAT128U Tmp;
4981 *(_Float128 *)&Tmp = rd128ValSrc;
4982 ASMCompilerBarrier();
4983 if (RTFLOAT128U_IS_NORMAL(&Tmp))
4984 {
4985 pr80Dst->s.fSign = Tmp.s64.fSign;
4986 pr80Dst->s.uExponent = Tmp.s64.uExponent;
4987 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
4988 | Tmp.s64.uFractionLo >> (64 - 15);
4989
4990 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4991 unsigned const cShiftOff = 64 - 15;
4992 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4993 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
4994 if (uRoundedOff)
4995 {
4996 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4997 ? RT_BIT_64(cShiftOff - 1)
4998 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4999 ? fRoundingOffMask
5000 : 0;
5001 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5002 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5003 || uRoundedOff != uRoundingAdd)
5004 {
5005 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5006 {
5007 uFraction += 1;
5008 if (!(uFraction & RT_BIT_64(63)))
5009 { /* likely */ }
5010 else
5011 {
5012 uFraction >>= 1;
5013 pr80Dst->s.uExponent++;
5014 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5015 return fFsw;
5016 }
5017 fFsw |= X86_FSW_C1;
5018 }
5019 }
5020 fFsw |= X86_FSW_PE;
5021 if (!(fFcw & X86_FCW_PM))
5022 fFsw |= X86_FSW_ES | X86_FSW_B;
5023 }
5024 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5025 }
5026 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5027 {
5028 pr80Dst->s.fSign = Tmp.s64.fSign;
5029 pr80Dst->s.uExponent = 0;
5030 pr80Dst->s.uMantissa = 0;
5031 }
5032 else if (RTFLOAT128U_IS_INF(&Tmp))
5033 {
5034 pr80Dst->s.fSign = Tmp.s64.fSign;
5035 pr80Dst->s.uExponent = 0;
5036 pr80Dst->s.uMantissa = 0;
5037 }
5038 return fFsw;
5039}
5040
5041
5042#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5043
5044/** Initializer for the SoftFloat state structure. */
5045# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
5046 { \
5047 softfloat_tininess_afterRounding, \
5048 ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
5049 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP ? (uint8_t)softfloat_round_max \
5050 : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN ? (uint8_t)softfloat_round_min \
5051 : (uint8_t)softfloat_round_minMag, \
5052 0, \
5053 (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
5054 ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
5055 : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
5056 }
5057
5058/** Returns updated FSW from a SoftFloat state and exception mask (FCW). */
5059# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
5060 ( (a_fFsw) \
5061 | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
5062 | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
5063 | ( ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
5064 ? X86_FSW_ES | X86_FSW_B : 0) )
5065
5066
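/** Limits the effective mantissa precision of @a r128 to @a cBits bits by
 *  masking off the low bits (no rounding is performed). */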
5067DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5068{
5069 RT_NOREF(fFcw);
5070 Assert(cBits > 64);
5071# if 0 /* rounding does not seem to help */
5072 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5073 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5074 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5075 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5076 {
5077 uint64_t uOld = r128.v[0];
5078 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5079 if (r128.v[0] < uOld)
5080 r128.v[1] += 1;
5081 }
5082# else
5083 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5084# endif
5085 return r128;
5086}
5087
5088
5089DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5090{
5091 RT_NOREF(fFcw);
5092 Assert(cBits > 64);
5093# if 0 /* rounding does not seem to help, not even on constants */
5094 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5095 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5096 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5097 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5098 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5099 {
5100 uint64_t uOld = r128.v[0];
5101 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5102 if (r128.v[0] < uOld)
5103 r128.v[1] += 1;
5104 }
5105 return r128;
5106# else
5107 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5108 return r128;
5109# endif
5110}
5111
5112
5113# if 0 /* unused */
5114DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
5115{
5116 float128_t r128 = { { pr128->au64[0], pr128->au64[1] } };
5117 return r128;
5118}
5119# endif
5120
5121
5122/** Converts an 80-bit floating point value to SoftFloat 128-bit floating point. */
5123DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5124{
5125 extFloat80_t Tmp;
5126 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5127 Tmp.signif = pr80Val->s2.uMantissa;
5128 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5129 return extF80_to_f128(Tmp, &Ignored);
5130}
5131
5132
5133/**
5134 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5135 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5136 *
5137 * This is only a structure format conversion, nothing else.
5138 */
5139DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5140{
5141 extFloat80_t Tmp;
5142 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5143 Tmp.signif = pr80Val->s2.uMantissa;
5144 return Tmp;
5145}
5146
5147
5148/**
5149 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5150 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5151 *
5152 * This is only a structure format conversion, nothing else.
5153 */
5154DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5155{
5156 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5157 pr80Dst->s2.uMantissa = r80XSrc.signif;
5158 return pr80Dst;
5159}
5160
5161
5162DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5163{
5164 RT_NOREF(fFcw);
5165 RTFLOAT128U Tmp;
5166 *(float128_t *)&Tmp = r128Src;
5167 ASMCompilerBarrier();
5168
5169 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5170 {
5171 pr80Dst->s.fSign = Tmp.s64.fSign;
5172 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5173 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5174 | Tmp.s64.uFractionLo >> (64 - 15);
5175
5176 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5177 unsigned const cShiftOff = 64 - 15;
5178 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5179 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5180 if (uRoundedOff)
5181 {
5182 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5183 ? RT_BIT_64(cShiftOff - 1)
5184 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5185 ? fRoundingOffMask
5186 : 0;
5187 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5188 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5189 || uRoundedOff != uRoundingAdd)
5190 {
5191 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5192 {
5193 uFraction += 1;
5194 if (!(uFraction & RT_BIT_64(63)))
5195 { /* likely */ }
5196 else
5197 {
5198 uFraction >>= 1;
5199 pr80Dst->s.uExponent++;
5200 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5201 return fFsw;
5202 }
5203 fFsw |= X86_FSW_C1;
5204 }
5205 }
5206 fFsw |= X86_FSW_PE;
5207 if (!(fFcw & X86_FCW_PM))
5208 fFsw |= X86_FSW_ES | X86_FSW_B;
5209 }
5210
5211 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5212 }
5213 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5214 {
5215 pr80Dst->s.fSign = Tmp.s64.fSign;
5216 pr80Dst->s.uExponent = 0;
5217 pr80Dst->s.uMantissa = 0;
5218 }
5219 else if (RTFLOAT128U_IS_INF(&Tmp))
5220 {
5221 pr80Dst->s.fSign = Tmp.s64.fSign;
5222 pr80Dst->s.uExponent = 0;
5223 pr80Dst->s.uMantissa = 0;
5224 }
5225 return fFsw;
5226}
5227
5228
5229/**
5230 * Helper for transferring exceptions and C1 to FSW and setting the result value
5231 * accordingly.
5232 *
5233 * @returns Updated FSW.
5234 * @param pSoftState The SoftFloat state following the operation.
5235 * @param r80XResult The result of the SoftFloat operation.
5236 * @param pr80Result Where to store the result for IEM.
5237 * @param fFcw The FPU control word.
5238 * @param fFsw The FSW before the operation, with necessary bits
5239 * cleared and such.
5240 * @param pr80XcptResult Alternative return value used when an unmasked \#IE is
5241 * raised.
5242 */
5243DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5244 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5245 PCRTFLOAT80U pr80XcptResult)
5246{
5247 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5248 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5249 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5250 fFsw |= X86_FSW_ES | X86_FSW_B;
5251
5252 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5253 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5254 else
5255 {
5256 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5257 *pr80Result = *pr80XcptResult;
5258 }
5259 return fFsw;
5260}
5261
5262
5263/**
5264 * Helper doing polynomial evaluation using Horner's method.
5265 *
5266 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
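 *
 * The constants and all intermediate results are masked down to @a cPrecision
 * mantissa bits between steps (see iemFpuSoftF128Precision).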
5267 */
5268float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5269 unsigned cPrecision, softfloat_state_t *pSoftState)
5270{
5271 Assert(cHornerConsts > 1);
5272 size_t i = cHornerConsts - 1;
5273 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5274 while (i-- > 0)
5275 {
5276 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5277 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5278 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5279 }
5280 return r128Result;
5281}
5282
5283#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5284
5285
5286/**
5287 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5288 * mantissa, exponent and sign.
5289 *
5290 * @returns Updated FSW.
5291 * @param pr80Dst Where to return the composed value.
5292 * @param fSign The sign.
5293 * @param puMantissa The mantissa as a 256-bit type, but the top 64 bits are
5294 * ignored and should be zero. This will probably be
5295 * modified during normalization and rounding.
5296 * @param iExponent Unbiased exponent.
5297 * @param fFcw The FPU control word.
5298 * @param fFsw The FPU status word.
5299 */
5300static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5301 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5302{
5303 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5304
5305 iExponent += RTFLOAT80U_EXP_BIAS;
5306
5307 /* Do normalization if necessary and possible. */
5308 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5309 {
5310 int cShift = 192 - RTUInt256BitCount(puMantissa);
5311 if (iExponent > cShift)
5312 iExponent -= cShift;
5313 else
5314 {
5315 if (fFcw & X86_FCW_UM)
5316 {
5317 if (iExponent > 0)
5318 cShift = --iExponent;
5319 else
5320 cShift = 0;
5321 }
5322 iExponent -= cShift;
5323 }
5324 RTUInt256AssignShiftLeft(puMantissa, cShift);
5325 }
5326
5327 /* Do rounding. */
5328 uint64_t uMantissa = puMantissa->QWords.qw2;
5329 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5330 {
5331 bool fAdd;
5332 switch (fFcw & X86_FCW_RC_MASK)
5333 {
5334            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
5335 case X86_FCW_RC_NEAREST:
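                /* Round up if the dropped bits are above the halfway point, or
                   exactly at it while the kept mantissa is odd (round half to even). */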
5336 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5337 {
5338 if ( (uMantissa & 1)
5339 || puMantissa->QWords.qw0 != 0
5340 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5341 {
5342 fAdd = true;
5343 break;
5344 }
5345 uMantissa &= ~(uint64_t)1;
5346 }
5347 fAdd = false;
5348 break;
5349 case X86_FCW_RC_ZERO:
5350 fAdd = false;
5351 break;
5352 case X86_FCW_RC_UP:
5353 fAdd = !fSign;
5354 break;
5355 case X86_FCW_RC_DOWN:
5356 fAdd = fSign;
5357 break;
5358 }
5359 if (fAdd)
5360 {
5361 uint64_t const uTmp = uMantissa;
5362 uMantissa = uTmp + 1;
5363 if (uMantissa < uTmp)
5364 {
5365 uMantissa >>= 1;
5366 uMantissa |= RT_BIT_64(63);
5367 iExponent++;
5368 }
5369 fFsw |= X86_FSW_C1;
5370 }
5371 fFsw |= X86_FSW_PE;
5372 if (!(fFcw & X86_FCW_PM))
5373 fFsw |= X86_FSW_ES | X86_FSW_B;
5374 }
5375
5376 /* Check for underflow (denormals). */
5377 if (iExponent <= 0)
5378 {
5379 if (fFcw & X86_FCW_UM)
5380 {
5381 if (uMantissa & RT_BIT_64(63))
5382 uMantissa >>= 1;
5383 iExponent = 0;
5384 }
5385 else
5386 {
5387 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5388 fFsw |= X86_FSW_ES | X86_FSW_B;
5389 }
5390 fFsw |= X86_FSW_UE;
5391 }
5392    /* Check for overflow - not expected here, just assert that it doesn't happen. */
5393 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5394 {
5395 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5396 }
5397
5398 /* Compose the result. */
5399 pr80Dst->s.uMantissa = uMantissa;
5400 pr80Dst->s.uExponent = iExponent;
5401 pr80Dst->s.fSign = fSign;
5402 return fFsw;
5403}
5404
5405
5406/**
5407 * See also iemAImpl_fld_r80_from_r32
5408 */
5409static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5410{
5411 uint16_t fFsw = 0;
5412 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5413 {
5414 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5415 pr80Dst->sj64.fInteger = 1;
5416 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5417 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5418 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5419 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5420 }
5421 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5422 {
5423 pr80Dst->s.fSign = pr32Val->s.fSign;
5424 pr80Dst->s.uExponent = 0;
5425 pr80Dst->s.uMantissa = 0;
5426 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5427 }
5428 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5429 {
5430 /* Subnormal -> normalized + X86_FSW_DE return. */
5431 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5432 pr80Dst->sj64.fInteger = 1;
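        /* The most significant fraction bit becomes the explicit integer bit (set
           above); the rest is shifted up behind it and the exponent is lowered by
           the extra shift. */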
5433 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5434 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5435 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5436 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5437 fFsw = X86_FSW_DE;
5438 }
5439 else if (RTFLOAT32U_IS_INF(pr32Val))
5440 {
5441 pr80Dst->s.fSign = pr32Val->s.fSign;
5442 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5443 pr80Dst->s.uMantissa = RT_BIT_64(63);
5444 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5445 }
5446 else
5447 {
5448 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5449 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5450 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5451 pr80Dst->sj64.fInteger = 1;
5452 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5453 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5454 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5455 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5456 }
5457 return fFsw;
5458}
5459
5460
5461/**
5462 * See also iemAImpl_fld_r80_from_r64
5463 */
5464static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5465{
5466 uint16_t fFsw = 0;
5467 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5468 {
5469 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5470 pr80Dst->sj64.fInteger = 1;
5471 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5472 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5473 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5474 }
5475 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5476 {
5477 pr80Dst->s.fSign = pr64Val->s.fSign;
5478 pr80Dst->s.uExponent = 0;
5479 pr80Dst->s.uMantissa = 0;
5480 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5481 }
5482 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5483 {
5484        /* Subnormal values get normalized. */
5485 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5486 pr80Dst->sj64.fInteger = 1;
5487 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5488 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5489 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5490 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5491 fFsw = X86_FSW_DE;
5492 }
5493 else if (RTFLOAT64U_IS_INF(pr64Val))
5494 {
5495 pr80Dst->s.fSign = pr64Val->s.fSign;
5496 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5497 pr80Dst->s.uMantissa = RT_BIT_64(63);
5498 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5499 }
5500 else
5501 {
5502 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5503 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5504 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5505 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5506 pr80Dst->sj64.fInteger = 1;
5507 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5508 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5509 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5510 }
5511 return fFsw;
5512}
5513
5514
5515/**
5516 * See also EMIT_FILD.
5517 */
5518#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5519static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5520{ \
5521 if (iVal == 0) \
5522 { \
5523 pr80Dst->s.fSign = 0; \
5524 pr80Dst->s.uExponent = 0; \
5525 pr80Dst->s.uMantissa = 0; \
5526 } \
5527 else \
5528 { \
5529 if (iVal > 0) \
5530 pr80Dst->s.fSign = 0; \
5531 else \
5532 { \
5533 pr80Dst->s.fSign = 1; \
5534 iVal = -iVal; \
5535 } \
5536 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5537 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5538 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5539 } \
5540 return pr80Dst; \
5541}
5542EMIT_CONVERT_IXX_TO_R80(16)
5543EMIT_CONVERT_IXX_TO_R80(32)
5544//EMIT_CONVERT_IXX_TO_R80(64)
5545
5546/** For implementing iemAImpl_fmul_r80_by_r64 and such. */
5547#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
5548IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
5549{ \
5550 RTFLOAT80U r80Val2; \
5551 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
5552 Assert(!fFsw || fFsw == X86_FSW_DE); \
5553 if (fFsw) \
5554 { \
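        /* The 64-bit source was a denormal (now normalized).  Don't report #DE when \
           the other operand is invalid or NaN, or when a_DenormalException holds; \
           otherwise an unmasked #DE is serviced right here, returning the first \
           operand unmodified. */ \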
5555 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5556 fFsw = 0; \
5557 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5558 { \
5559 pFpuRes->r80Result = *pr80Val1; \
5560 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5561 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5562 return; \
5563 } \
5564 } \
5565 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5566 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5567}
5568
5569/** For implementing iemAImpl_fmul_r80_by_r32 and such. */
5570#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
5571IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
5572{ \
5573 RTFLOAT80U r80Val2; \
5574 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
5575 Assert(!fFsw || fFsw == X86_FSW_DE); \
5576 if (fFsw) \
5577 { \
5578 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_NAN(pr80Val1) || (a_DenormalException)) \
5579 fFsw = 0; \
5580 else if (!(pFpuState->FCW & X86_FCW_DM)) \
5581 { \
5582 pFpuRes->r80Result = *pr80Val1; \
5583 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
5584 | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
5585 return; \
5586 } \
5587 } \
5588 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
5589 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
5590}
5591
5592/** For implementing iemAImpl_fimul_r80_by_i32 and such. */
5593#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
5594IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
5595{ \
5596 RTFLOAT80U r80Val2; \
5597 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2)); \
5598 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5599}
5600
5601/** For implementing iemAImpl_fimul_r80_by_i16 and such. */
5602#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
5603IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
5604{ \
5605 RTFLOAT80U r80Val2; \
5606 a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2)); \
5607 pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
5608}
5609
5610
5611
5612/*********************************************************************************************************************************
5613* x86 FPU Division Operations *
5614*********************************************************************************************************************************/
5615
5616/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5617static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5618 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5619{
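    /* Only a zero divisor with a non-NaN, non-infinity dividend needs special care
       here (#Z for x/0, #IE for 0/0); everything else is left to SoftFloat. */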
5620 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5621 {
5622 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5623 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5624 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5625 }
5626 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5627 { /* Div by zero. */
5628 if (fFcw & X86_FCW_ZM)
5629 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5630 else
5631 {
5632 *pr80Result = *pr80Val1Org;
5633 fFsw |= X86_FSW_ES | X86_FSW_B;
5634 }
5635 fFsw |= X86_FSW_ZE;
5636 }
5637 else
5638 { /* Invalid operand */
5639 if (fFcw & X86_FCW_IM)
5640 *pr80Result = g_r80Indefinite;
5641 else
5642 {
5643 *pr80Result = *pr80Val1Org;
5644 fFsw |= X86_FSW_ES | X86_FSW_B;
5645 }
5646 fFsw |= X86_FSW_IE;
5647 }
5648 return fFsw;
5649}
5650
5651
5652IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5653 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5654{
5655 uint16_t const fFcw = pFpuState->FCW;
5656 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5657
5658    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5659 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5660 {
5661 if (fFcw & X86_FCW_IM)
5662 pFpuRes->r80Result = g_r80Indefinite;
5663 else
5664 {
5665 pFpuRes->r80Result = *pr80Val1;
5666 fFsw |= X86_FSW_ES | X86_FSW_B;
5667 }
5668 fFsw |= X86_FSW_IE;
5669 }
5670    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5671 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5672 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5673 {
5674 if (fFcw & X86_FCW_DM)
5675 {
5676 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5677 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5678 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5679 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5680 }
5681 else
5682 {
5683 pFpuRes->r80Result = *pr80Val1;
5684 fFsw |= X86_FSW_ES | X86_FSW_B;
5685 }
5686 fFsw |= X86_FSW_DE;
5687 }
5688 /* SoftFloat can handle the rest: */
5689 else
5690 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5691
5692 pFpuRes->FSW = fFsw;
5693}
5694
5695
5696EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5697EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5698EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5699EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5700
5701
5702IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5703 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5704{
5705 uint16_t const fFcw = pFpuState->FCW;
5706 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5707
5708    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5709 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5710 {
5711 if (fFcw & X86_FCW_IM)
5712 pFpuRes->r80Result = g_r80Indefinite;
5713 else
5714 {
5715 pFpuRes->r80Result = *pr80Val1;
5716 fFsw |= X86_FSW_ES | X86_FSW_B;
5717 }
5718 fFsw |= X86_FSW_IE;
5719 }
5720    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs & /0 trump denormals. */
5721 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5722 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5723 {
5724 if (fFcw & X86_FCW_DM)
5725 {
5726 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5727 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5728 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5729 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5730 }
5731 else
5732 {
5733 pFpuRes->r80Result = *pr80Val1;
5734 fFsw |= X86_FSW_ES | X86_FSW_B;
5735 }
5736 fFsw |= X86_FSW_DE;
5737 }
5738 /* SoftFloat can handle the rest: */
5739 else
5740 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5741
5742 pFpuRes->FSW = fFsw;
5743}
5744
5745
5746EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5747EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5748EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5749EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5750
5751
5752/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5753static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5754 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5755{
5756 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5757 {
5758 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5759 uint16_t fCxFlags = 0;
5760 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5761 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5762 &fCxFlags, &SoftState);
5763 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5764 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5765 if ( !(fFsw & X86_FSW_IE)
5766 && !RTFLOAT80U_IS_NAN(pr80Result)
5767 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5768 {
5769 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5770 fFsw |= fCxFlags & X86_FSW_C_MASK;
5771 }
5772 return fFsw;
5773 }
5774
5775 /* Invalid operand */
5776 if (fFcw & X86_FCW_IM)
5777 *pr80Result = g_r80Indefinite;
5778 else
5779 {
5780 *pr80Result = *pr80Val1Org;
5781 fFsw |= X86_FSW_ES | X86_FSW_B;
5782 }
5783 return fFsw | X86_FSW_IE;
5784}
5785
5786
5787static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5788 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5789{
5790 uint16_t const fFcw = pFpuState->FCW;
5791 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5792
5793    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals.
5794       In addition, we'd like to handle a zero ST(1) here since SoftFloat returns Inf instead
5795       of Indefinite. (Note! There is no #Z, unlike what the footnotes to tables 3-31 and
5796       3-32 for the FPREM & FPREM1 instructions in the Intel reference manual claim!) */
5797 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5798 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5799 {
5800 if (fFcw & X86_FCW_IM)
5801 pFpuRes->r80Result = g_r80Indefinite;
5802 else
5803 {
5804 pFpuRes->r80Result = *pr80Val1;
5805 fFsw |= X86_FSW_ES | X86_FSW_B;
5806 }
5807 fFsw |= X86_FSW_IE;
5808 }
5809 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5810 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5811 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5812 {
5813 if (fFcw & X86_FCW_DM)
5814 {
5815 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5816 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5817 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5818 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5819 pr80Val1Org, fLegacyInstr);
5820 }
5821 else
5822 {
5823 pFpuRes->r80Result = *pr80Val1;
5824 fFsw |= X86_FSW_ES | X86_FSW_B;
5825 }
5826 fFsw |= X86_FSW_DE;
5827 }
5828 /* SoftFloat can handle the rest: */
5829 else
5830 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5831 pr80Val1, fLegacyInstr);
5832
5833 pFpuRes->FSW = fFsw;
5834}
5835
5836
5837IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5838 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5839{
5840 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5841}
5842
5843
5844IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5845 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5846{
5847 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5848}
5849
5850
5851/*********************************************************************************************************************************
5852* x87 FPU Multiplication Operations *
5853*********************************************************************************************************************************/
5854
5855/** Worker for iemAImpl_fmul_r80_by_r80. */
5856static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5857 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5858{
5859 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5860 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5861 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5862}
5863
5864
5865IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5866 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5867{
5868 uint16_t const fFcw = pFpuState->FCW;
5869 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5870
5871    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5872 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5873 {
5874 if (fFcw & X86_FCW_IM)
5875 pFpuRes->r80Result = g_r80Indefinite;
5876 else
5877 {
5878 pFpuRes->r80Result = *pr80Val1;
5879 fFsw |= X86_FSW_ES | X86_FSW_B;
5880 }
5881 fFsw |= X86_FSW_IE;
5882 }
5883    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5884 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5885 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5886 {
5887 if (fFcw & X86_FCW_DM)
5888 {
5889 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5890 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5891 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5892 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5893 }
5894 else
5895 {
5896 pFpuRes->r80Result = *pr80Val1;
5897 fFsw |= X86_FSW_ES | X86_FSW_B;
5898 }
5899 fFsw |= X86_FSW_DE;
5900 }
5901 /* SoftFloat can handle the rest: */
5902 else
5903 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5904
5905 pFpuRes->FSW = fFsw;
5906}
5907
5908
5909EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5910EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5911EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5912EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5913
5914
5915/*********************************************************************************************************************************
5916* x87 FPU Addition *
5917*********************************************************************************************************************************/
5918
5919/** Worker for iemAImpl_fadd_r80_by_r80. */
5920static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5921 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5922{
5923 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5924 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5925 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5926}
5927
5928
5929IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5930 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5931{
5932 uint16_t const fFcw = pFpuState->FCW;
5933 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5934
5935    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
5936 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5937 {
5938 if (fFcw & X86_FCW_IM)
5939 pFpuRes->r80Result = g_r80Indefinite;
5940 else
5941 {
5942 pFpuRes->r80Result = *pr80Val1;
5943 fFsw |= X86_FSW_ES | X86_FSW_B;
5944 }
5945 fFsw |= X86_FSW_IE;
5946 }
5947    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
5948 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5949 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5950 {
5951 if (fFcw & X86_FCW_DM)
5952 {
5953 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5954 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5955 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5956 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5957 }
5958 else
5959 {
5960 pFpuRes->r80Result = *pr80Val1;
5961 fFsw |= X86_FSW_ES | X86_FSW_B;
5962 }
5963 fFsw |= X86_FSW_DE;
5964 }
5965 /* SoftFloat can handle the rest: */
5966 else
5967 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5968
5969 pFpuRes->FSW = fFsw;
5970}
5971
5972
5973EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
5974EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
5975EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
5976EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
5977
5978
5979/*********************************************************************************************************************************
5980* x87 FPU Subtraction *
5981*********************************************************************************************************************************/
5982
5983/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
5984static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5985 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5986{
5987 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5988 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5989 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5990}
5991
5992
5993IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5994 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5995{
5996 uint16_t const fFcw = pFpuState->FCW;
5997 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5998
5999    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
6000 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6001 {
6002 if (fFcw & X86_FCW_IM)
6003 pFpuRes->r80Result = g_r80Indefinite;
6004 else
6005 {
6006 pFpuRes->r80Result = *pr80Val1;
6007 fFsw |= X86_FSW_ES | X86_FSW_B;
6008 }
6009 fFsw |= X86_FSW_IE;
6010 }
6011    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6012 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6013 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6014 {
6015 if (fFcw & X86_FCW_DM)
6016 {
6017 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6018 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6019 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6020 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6021 }
6022 else
6023 {
6024 pFpuRes->r80Result = *pr80Val1;
6025 fFsw |= X86_FSW_ES | X86_FSW_B;
6026 }
6027 fFsw |= X86_FSW_DE;
6028 }
6029 /* SoftFloat can handle the rest: */
6030 else
6031 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6032
6033 pFpuRes->FSW = fFsw;
6034}
6035
6036
6037EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6038EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6039EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6040EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6041
6042
6043/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6044IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6045 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6046{
6047 uint16_t const fFcw = pFpuState->FCW;
6048 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6049
6050    /* SoftFloat does not check for Pseudo-Infinity, Pseudo-NaN and Unnormals. */
6051 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6052 {
6053 if (fFcw & X86_FCW_IM)
6054 pFpuRes->r80Result = g_r80Indefinite;
6055 else
6056 {
6057 pFpuRes->r80Result = *pr80Val1;
6058 fFsw |= X86_FSW_ES | X86_FSW_B;
6059 }
6060 fFsw |= X86_FSW_IE;
6061 }
6062    /* SoftFloat does not check for denormals and certainly does not report them to us. NaNs trump denormals. */
6063 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6064 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6065 {
6066 if (fFcw & X86_FCW_DM)
6067 {
6068 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6069 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6070 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6071 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6072 }
6073 else
6074 {
6075 pFpuRes->r80Result = *pr80Val1;
6076 fFsw |= X86_FSW_ES | X86_FSW_B;
6077 }
6078 fFsw |= X86_FSW_DE;
6079 }
6080 /* SoftFloat can handle the rest: */
6081 else
6082 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6083
6084 pFpuRes->FSW = fFsw;
6085}
6086
6087
6088EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6089EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6090EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6091EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6092
6093
6094/*********************************************************************************************************************************
6095*   x87 FPU Trigonometric Operations                                                                                             *
6096*********************************************************************************************************************************/
6097
6098
6099IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6100 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6101{
6102 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6103 AssertReleaseFailed();
6104}
6105
6106#endif /* IEM_WITHOUT_ASSEMBLY */
6107
6108IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6109 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6110{
6111 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6112}
6113
6114IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6115 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6116{
6117 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6118}
6119
6120
6121#if defined(IEM_WITHOUT_ASSEMBLY)
6122IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6123{
6124 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6125 AssertReleaseFailed();
6126}
6127#endif /* IEM_WITHOUT_ASSEMBLY */
6128
6129IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6130{
6131 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6132}
6133
6134IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6135{
6136 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6137}
6138
6139
6140#ifdef IEM_WITHOUT_ASSEMBLY
6141IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6142{
6143 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6144 AssertReleaseFailed();
6145}
6146#endif /* IEM_WITHOUT_ASSEMBLY */
6147
6148IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6149{
6150 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6151}
6152
6153IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6154{
6155 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6156}
6157
6158#ifdef IEM_WITHOUT_ASSEMBLY
6159IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6160{
6161 RT_NOREF(pFpuState, pFpuResTwo, pr80Val);
6162 AssertReleaseFailed();
6163}
6164#endif /* IEM_WITHOUT_ASSEMBLY */
6165
6166IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6167{
6168 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6169}
6170
6171IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6172{
6173 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6174}
6175
6176
6177#ifdef IEM_WITHOUT_ASSEMBLY
6178IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6179{
6180 RT_NOREF(pFpuState, pFpuRes, pr80Val);
6181 AssertReleaseFailed();
6182}
6183#endif /* IEM_WITHOUT_ASSEMBLY */
6184
6185IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6186{
6187 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6188}
6189
6190IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6191{
6192 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6193}
6194
6195#ifdef IEM_WITHOUT_ASSEMBLY
6196
6197
6198/*********************************************************************************************************************************
6199* x87 FPU Compare and Testing Operations *
6200*********************************************************************************************************************************/
6201
6202IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6203{
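    /* FTST compares ST(0) against +0.0: C3=1 for zero, C0=1 for negative normals,
       infinities and denormals (denormals also raise #DE), and C0=C2=C3=1 together
       with #IE for unordered inputs (NaNs and invalid encodings). */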
6204 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6205
6206 if (RTFLOAT80U_IS_ZERO(pr80Val))
6207 fFsw |= X86_FSW_C3;
6208 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6209 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6210 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6211 {
6212 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6213 if (!(pFpuState->FCW & X86_FCW_DM))
6214 fFsw |= X86_FSW_ES | X86_FSW_B;
6215 }
6216 else
6217 {
6218 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6219 if (!(pFpuState->FCW & X86_FCW_IM))
6220 fFsw |= X86_FSW_ES | X86_FSW_B;
6221 }
6222
6223 *pu16Fsw = fFsw;
6224}
6225
6226
6227IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6228{
6229 RT_NOREF(pFpuState);
6230 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6231
6232    /* C1 = sign bit (always, even for empty registers according to Intel). */
6233 if (pr80Val->s.fSign)
6234 fFsw |= X86_FSW_C1;
6235
6236 /* Classify the value in C0, C2, C3. */
6237 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6238 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6239 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6240 fFsw |= X86_FSW_C2;
6241 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6242 fFsw |= X86_FSW_C3;
6243 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6244 fFsw |= X86_FSW_C0;
6245 else if (RTFLOAT80U_IS_INF(pr80Val))
6246 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6247 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6248 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6249 /* whatever else: 0 */
6250
6251 *pu16Fsw = fFsw;
6252}
6253
6254
6255/**
6256 * Worker for fcom, fucom, and friends.
6257 */
6258static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6259 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6260{
6261 /*
6262 * Unpack the values.
6263 */
6264 bool const fSign1 = pr80Val1->s.fSign;
6265 int32_t iExponent1 = pr80Val1->s.uExponent;
6266 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6267
6268 bool const fSign2 = pr80Val2->s.fSign;
6269 int32_t iExponent2 = pr80Val2->s.uExponent;
6270 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6271
6272 /*
6273 * Check for invalid inputs.
6274 */
6275 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6276 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6277 {
6278 if (!(fFcw & X86_FCW_IM))
6279 fFsw |= X86_FSW_ES | X86_FSW_B;
6280 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6281 }
6282
6283 /*
6284     * Check for NaNs and indefinites; they are all unordered and trump #DE.
6285 */
6286 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6287 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6288 {
6289 if ( fIeOnAllNaNs
6290 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6291 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6292 {
6293 fFsw |= X86_FSW_IE;
6294 if (!(fFcw & X86_FCW_IM))
6295 fFsw |= X86_FSW_ES | X86_FSW_B;
6296 }
6297 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6298 }
6299
6300 /*
6301 * Normalize the values.
6302 */
6303 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6304 {
6305 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6306 iExponent1 = 1;
6307 else
6308 {
6309 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6310 uMantissa1 <<= iExponent1;
6311 iExponent1 = 1 - iExponent1;
6312 }
6313 fFsw |= X86_FSW_DE;
6314 if (!(fFcw & X86_FCW_DM))
6315 fFsw |= X86_FSW_ES | X86_FSW_B;
6316 }
6317
6318 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6319 {
6320 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6321 iExponent2 = 1;
6322 else
6323 {
6324 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6325 uMantissa2 <<= iExponent2;
6326 iExponent2 = 1 - iExponent2;
6327 }
6328 fFsw |= X86_FSW_DE;
6329 if (!(fFcw & X86_FCW_DM))
6330 fFsw |= X86_FSW_ES | X86_FSW_B;
6331 }
6332
6333 /*
6334 * Test if equal (val1 == val2):
6335 */
6336 if ( uMantissa1 == uMantissa2
6337 && iExponent1 == iExponent2
6338 && ( fSign1 == fSign2
6339 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6340 fFsw |= X86_FSW_C3;
6341 /*
6342 * Test if less than (val1 < val2):
6343 */
6344 else if (fSign1 && !fSign2)
6345 fFsw |= X86_FSW_C0;
6346 else if (fSign1 == fSign2)
6347 {
6348        /* Zeros are problematic; however, at most one of them can be zero here. */
6349 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6350 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6351 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6352 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6353
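        /* Same sign and neither value is zero: compare exponents and mantissas;
           when both operands are negative the magnitude test is inverted, hence
           the XOR with the sign bit. */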
6354 if ( fSign1
6355 ^ ( iExponent1 < iExponent2
6356 || ( iExponent1 == iExponent2
6357 && uMantissa1 < uMantissa2 ) ) )
6358 fFsw |= X86_FSW_C0;
6359 }
6360 /* else: No flags set if greater. */
6361
6362 return fFsw;
6363}
6364
6365
6366IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6367 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6368{
6369 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6370}
6371
6372
6373
6374
6375IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6376 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6377{
6378 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6379}
6380
6381
6382IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6383 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6384{
6385 RTFLOAT80U r80Val2;
6386 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6387 Assert(!fFsw || fFsw == X86_FSW_DE);
6388 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6389 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6390 {
6391 if (!(pFpuState->FCW & X86_FCW_DM))
6392 fFsw |= X86_FSW_ES | X86_FSW_B;
6393 *pfFsw |= fFsw;
6394 }
6395}
6396
6397
6398IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6399 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6400{
6401 RTFLOAT80U r80Val2;
6402 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6403 Assert(!fFsw || fFsw == X86_FSW_DE);
6404 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6405 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6406 {
6407 if (!(pFpuState->FCW & X86_FCW_DM))
6408 fFsw |= X86_FSW_ES | X86_FSW_B;
6409 *pfFsw |= fFsw;
6410 }
6411}
6412
6413
6414IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6415 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6416{
6417 RTFLOAT80U r80Val2;
6418 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6419 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6420}
6421
6422
6423IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6424 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6425{
6426 RTFLOAT80U r80Val2;
6427 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6428 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6429}
6430
6431
6432/**
6433 * Worker for fcomi & fucomi.
6434 */
6435static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6436 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6437{
6438 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
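    /* Map the FSW condition codes to EFLAGS: C3 -> ZF, C2 -> PF, C0 -> CF
       (an unordered compare sets all three). */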
6439 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6440 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6441 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6442
6443 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6444 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6445 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6446}
6447
6448
6449IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6450 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6451{
6452 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6453}
6454
6455
6456IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6457 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6458{
6459 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6460}
6461
6462
6463/*********************************************************************************************************************************
6464* x87 FPU Other Operations *
6465*********************************************************************************************************************************/
6466
6467/**
6468 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6469 */
6470static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6471{
6472 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6473 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6474 true /*exact / generate #PE */, &SoftState));
6475 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6476}
6477
6478
6479IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6480{
6481 uint16_t const fFcw = pFpuState->FCW;
6482 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6483
6484 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6485 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6486 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6487 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6488 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6489 || RTFLOAT80U_IS_INF(pr80Val))
6490 pFpuRes->r80Result = *pr80Val;
6491 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6492 {
6493 fFsw |= X86_FSW_DE;
6494 if (fFcw & X86_FCW_DM)
6495 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6496 else
6497 {
6498 pFpuRes->r80Result = *pr80Val;
6499 fFsw |= X86_FSW_ES | X86_FSW_B;
6500 }
6501 }
6502 else
6503 {
6504 if (fFcw & X86_FCW_IM)
6505 {
6506 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6507 pFpuRes->r80Result = g_r80Indefinite;
6508 else
6509 {
6510 pFpuRes->r80Result = *pr80Val;
6511 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6512 }
6513 }
6514 else
6515 {
6516 pFpuRes->r80Result = *pr80Val;
6517 fFsw |= X86_FSW_ES | X86_FSW_B;
6518 }
6519 fFsw |= X86_FSW_IE;
6520 }
6521 pFpuRes->FSW = fFsw;
6522}
6523
6524
6525IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6526 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6527{
6528 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
6529 it does everything we need it to do. */
6530 uint16_t const fFcw = pFpuState->FCW;
6531 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6532 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6533 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6534 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6535}
6536
6537
6538/**
6539 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
6540 */
6541static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6542{
6543 Assert(!pr80Val->s.fSign);
6544 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6545 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
6546 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6547}
6548
6549
6550IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6551{
6552 uint16_t const fFcw = pFpuState->FCW;
6553 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6554
6555 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
6556 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6557 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6558 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6559 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6560 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
6561 pFpuRes->r80Result = *pr80Val;
6562 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
6563 {
6564 fFsw |= X86_FSW_DE;
6565 if (fFcw & X86_FCW_DM)
6566 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6567 else
6568 {
6569 pFpuRes->r80Result = *pr80Val;
6570 fFsw |= X86_FSW_ES | X86_FSW_B;
6571 }
6572 }
6573 else
6574 {
6575 if (fFcw & X86_FCW_IM)
6576 {
6577 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6578 pFpuRes->r80Result = g_r80Indefinite;
6579 else
6580 {
6581 pFpuRes->r80Result = *pr80Val;
6582 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6583 }
6584 }
6585 else
6586 {
6587 pFpuRes->r80Result = *pr80Val;
6588 fFsw |= X86_FSW_ES | X86_FSW_B;
6589 }
6590 fFsw |= X86_FSW_IE;
6591 }
6592 pFpuRes->FSW = fFsw;
6593}
6594
6595
6596/**
6597 * @code{.unparsed}
6598 *     f(x) = 2^x - 1 = e^(x * ln2) - 1
6600 *
6601 * @endcode
6602 *
6603 * We can approximate e^x by a Taylor/Maclaurin series (see
6604 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
6605 * @code{.unparsed}
6606 *     SUM(n=0..inf) x^n / n!  =  x^0/0! + x^1/1! + x^2/2! + x^3/3! + x^4/4! + ...
6607 *
6608 *                             =  1 + x + x^2/2! + x^3/3! + x^4/4! + ...
6615 * @endcode
6616 *
6617 * Given z = x * ln2, we get:
6618 * @code{.unparsed}
6619 *     e^z - 1  =  z + z^2/2! + z^3/3! + z^4/4! + ... + z^n/n!
6623 * @endcode
6624 *
6625 * Wanting to use Horner's method, we move one z outside and get:
6626 * @code{.unparsed}
6627 *     e^z - 1  =  z * ( 1 + z/2! + z^2/3! + z^3/4! + ... + z^(n-1)/n! )
6631 * @endcode
6632 *
6633 * The constants we need for using Horner's method are 1 and 1 / n!.
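 *
 * As a purely illustrative sketch (plain C with doubles rather than the
 * 128-bit SoftFloat arithmetic used below, and with a hypothetical, truncated
 * coefficient array whose ordering need not match g_ar128F2xm1HornerConsts),
 * the Horner evaluation of the bracketed series could look like this:
 * @code{.unparsed}
 *     static double const s_adCoeffs[] = { 1.0/24.0, 1.0/6.0, 1.0/2.0 }; // 1/4!, 1/3!, 1/2!
 *     double r = s_adCoeffs[0];
 *     for (unsigned i = 1; i < RT_ELEMENTS(s_adCoeffs); i++)
 *         r = r * z + s_adCoeffs[i];
 *     r = r * z + 1.0;    // the constant 1 term
 *     r = z * r;          // the z factored out in front, giving ~(e^z - 1)
 * @endcode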
6634 *
6635 * For very tiny x values, we can get away with f(x) = x * ln2, because
6636 * we don't have the necessary precision to represent 1.0 + z/3 + ...
6637 * and can approximate it to be 1.0. For a visual demonstration of this,
6638 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
6639 * as it remains valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
6640 *
6641 *
6642 * As far as constant accuracy goes, figure 0.1 "80387 Block Diagram" in the
6643 * "80387 Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001;
6644 * Military i387SX 271166-002) indicates that constants are 67-bit (constant
6645 * ROM block) and the internal mantissa size is 68-bit (mantissa adder &
6646 * barrel shifter blocks). (The one bit difference is probably an implicit
6647 * one missing from the constant ROM.) A paper on division and sqrt on the
6648 * AMD-K7 by Stuart F. Oberman states that it internally used a 68-bit
6649 * mantissa with an 18-bit exponent.
6650 *
6651 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
6652 * not yet successfully reproduced the exact results from an Intel 10980XE;
6653 * there is always a portion of rounding differences. Not going to spend too
6654 * much time on getting this 100% the same, at least not now.
6655 *
6656 * P.S. If someone is really curious about the 8087 and its constants:
6657 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
6658 *
6659 *
6660 * @param pr80Val The exponent value (x), less than 1.0, greater than
6661 * -1.0 and not zero. This can be a normal, denormal
6662 * or pseudo-denormal value.
6663 * @param pr80Result Where to return the result.
6664 * @param fFcw FPU control word.
6665 * @param fFsw FPU status word.
6666 */
6667static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6668{
6669 /* As mentioned above, we can skip the expensive polynomial calculation
6670 as it will be close enough to 1.0 that it makes no difference.
6671
6672       The cutoff point for the Intel 10980XE is exponents >= -69. Intel
6673 also seems to be using a 67-bit or 68-bit constant value, and we get
6674 a smattering of rounding differences if we go for higher precision. */
6675 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
6676 {
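        /* For such tiny inputs 2^x - 1 is approximated by x * ln2: form the 192-bit
           product of the 128-bit ln2 mantissa constant and the 64-bit input mantissa,
           then round and compose the 80-bit result below. */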
6677 RTUINT256U u256;
6678 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
6679 u256.QWords.qw0 |= 1; /* force #PE */
6680 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
6681 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
6682 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
6683 : 1 - RTFLOAT80U_EXP_BIAS,
6684 fFcw, fFsw);
6685 }
6686 else
6687 {
6688#ifdef IEM_WITH_FLOAT128_FOR_FPU
6689        /* This approach is not good enough for small values; we end up with zero. */
6690 int const fOldRounding = iemFpuF128SetRounding(fFcw);
6691 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
6692 _Float128 rd128Result = powf128(2.0L, rd128Val);
6693 rd128Result -= 1.0L;
6694 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
6695 iemFpuF128RestoreRounding(fOldRounding);
6696
6697# else
6698 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6699 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
6700
6701 /* As mentioned above, enforce 68-bit internal mantissa width to better
6702 match the Intel 10980XE results. */
6703 unsigned const cPrecision = 68;
6704
6705 /* first calculate z = x * ln2 */
6706 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
6707 cPrecision);
6708
6709 /* Then do the polynomial evaluation. */
6710 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
6711 cPrecision, &SoftState);
6712 r = f128_mul(z, r, &SoftState);
6713
6714 /* Output the result. */
6715 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
6716# endif
6717 }
6718 return fFsw;
6719}
6720
6721
6722IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6723{
6724 uint16_t const fFcw = pFpuState->FCW;
6725 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6726
6727 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6728 {
6729 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
6730 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6731 else
6732 {
6733 /* Special case:
6734 2^+1.0 - 1.0 = 1.0
6735 2^-1.0 - 1.0 = -0.5 */
6736 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
6737 && pr80Val->s.uMantissa == RT_BIT_64(63))
6738 {
6739 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
6740 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
6741 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6742 }
6743 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
6744 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
6745 else
6746 pFpuRes->r80Result = *pr80Val;
6747 fFsw |= X86_FSW_PE;
6748 if (!(fFcw & X86_FCW_PM))
6749 fFsw |= X86_FSW_ES | X86_FSW_B;
6750 }
6751 }
6752 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6753 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6754 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6755 pFpuRes->r80Result = *pr80Val;
6756 else if (RTFLOAT80U_IS_INF(pr80Val))
6757 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
6758 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6759 {
6760 fFsw |= X86_FSW_DE;
6761 if (fFcw & X86_FCW_DM)
6762 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6763 else
6764 {
6765 pFpuRes->r80Result = *pr80Val;
6766 fFsw |= X86_FSW_ES | X86_FSW_B;
6767 }
6768 }
6769 else
6770 {
6771 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6772 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6773 && (fFcw & X86_FCW_IM))
6774 pFpuRes->r80Result = g_r80Indefinite;
6775 else
6776 {
6777 pFpuRes->r80Result = *pr80Val;
6778 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6779 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6780 }
6781 fFsw |= X86_FSW_IE;
6782 if (!(fFcw & X86_FCW_IM))
6783 fFsw |= X86_FSW_ES | X86_FSW_B;
6784 }
6785 pFpuRes->FSW = fFsw;
6786}
6787
6788#endif /* IEM_WITHOUT_ASSEMBLY */
6789
6790IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6791{
6792 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6793}
6794
6795IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6796{
6797 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
6798}
6799
6800#ifdef IEM_WITHOUT_ASSEMBLY
6801
6802IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6803{
6804 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6805 pFpuRes->r80Result = *pr80Val;
6806 pFpuRes->r80Result.s.fSign = 0;
6807}
6808
6809
6810IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6811{
6812 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6813 pFpuRes->r80Result = *pr80Val;
6814 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
6815}
6816
6817
6818IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6819{
6820 uint16_t const fFcw = pFpuState->FCW;
6821 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6822
6823 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6824 {
6825 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6826 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
6827
6828 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6829 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6830 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
6831 }
6832 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6833 {
6834 fFsw |= X86_FSW_ZE;
6835 if (fFcw & X86_FCW_ZM)
6836 {
6837 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
6838 pFpuResTwo->r80Result2 = *pr80Val;
6839 }
6840 else
6841 {
6842 pFpuResTwo->r80Result2 = *pr80Val;
6843 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6844 }
6845 }
6846 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6847 {
6848 fFsw |= X86_FSW_DE;
6849 if (fFcw & X86_FCW_DM)
6850 {
6851 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
6852 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
6853 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
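            /* Normalize the (pseudo-)denormal: shift the mantissa left until bit 63
               is set, starting at the smallest normal exponent (-16382) and
               decrementing it once per shift. */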
6854 int32_t iExponent = -16382;
6855 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
6856 {
6857 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
6858 iExponent--;
6859 }
6860
6861 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
6862 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
6863 }
6864 else
6865 {
6866 pFpuResTwo->r80Result2 = *pr80Val;
6867 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6868 }
6869 }
6870 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6871 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6872 {
6873 pFpuResTwo->r80Result1 = *pr80Val;
6874 pFpuResTwo->r80Result2 = *pr80Val;
6875 }
6876 else if (RTFLOAT80U_IS_INF(pr80Val))
6877 {
6878 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
6879 pFpuResTwo->r80Result2 = *pr80Val;
6880 }
6881 else
6882 {
6883 if (fFcw & X86_FCW_IM)
6884 {
6885 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6886 pFpuResTwo->r80Result1 = g_r80Indefinite;
6887 else
6888 {
6889 pFpuResTwo->r80Result1 = *pr80Val;
6890 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6891 }
6892 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
6893 }
6894 else
6895 {
6896 pFpuResTwo->r80Result2 = *pr80Val;
6897 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6898 }
6899 fFsw |= X86_FSW_IE;
6900 }
6901 pFpuResTwo->FSW = fFsw;
6902}
6903
6904
6905IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6906 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6907{
6908 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6909 AssertReleaseFailed();
6910}
6911
6912#endif /* IEM_WITHOUT_ASSEMBLY */
6913
6914IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6915 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6916{
6917 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6918}
6919
6920IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6921 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6922{
6923 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6924}
6925
6926#if defined(IEM_WITHOUT_ASSEMBLY)
6927
6928IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6929 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6930{
6931 RT_NOREF(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6932 AssertReleaseFailed();
6933}
6934
6935#endif /* IEM_WITHOUT_ASSEMBLY */
6936
6937IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6938 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6939{
6940 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6941}
6942
6943IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6944 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6945{
6946 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6947}
6948
6949
6950/*********************************************************************************************************************************
6951* MMX, SSE & AVX *
6952*********************************************************************************************************************************/
6953
6954IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
6955{
6956 RT_NOREF(pFpuState);
6957 puDst->au32[0] = puSrc->au32[0];
6958 puDst->au32[1] = puSrc->au32[0];
6959 puDst->au32[2] = puSrc->au32[2];
6960 puDst->au32[3] = puSrc->au32[2];
6961}
6962
6963#ifdef IEM_WITH_VEX
6964
6965IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
6966{
6967 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
6968 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
6969 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
6970 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
6971 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6972 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
6973 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6974 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
6975}
6976
6977
6978IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
6979{
6980 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
6981 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
6982 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
6983 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
6984 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
6985 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
6986 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
6987 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
6988}
6989
6990#endif /* IEM_WITH_VEX */
6991
6992
6993IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
6994{
6995 RT_NOREF(pFpuState);
6996 puDst->au32[0] = puSrc->au32[1];
6997 puDst->au32[1] = puSrc->au32[1];
6998 puDst->au32[2] = puSrc->au32[3];
6999 puDst->au32[3] = puSrc->au32[3];
7000}
7001
7002
7003IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, uint64_t uSrc))
7004{
7005 RT_NOREF(pFpuState);
7006 puDst->au64[0] = uSrc;
7007 puDst->au64[1] = uSrc;
7008}
7009
7010#ifdef IEM_WITH_VEX
7011
7012IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7013{
7014 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7015 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7016 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7017 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7018}
7019
7020IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7021{
7022 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7023 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7024 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7025 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7026}
7027
7028#endif /* IEM_WITH_VEX */
7029
7030#ifdef IEM_WITHOUT_ASSEMBLY
7031
7032IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7033{
7034 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7035 AssertReleaseFailed();
7036}
7037
7038
7039IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7040{
7041 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7042 AssertReleaseFailed();
7043}
7044
7045
7046IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7047{
7048 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7049 AssertReleaseFailed();
7050}
7051
7052
7053IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7054{
7055 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7056 AssertReleaseFailed();
7057}
7058
7059
7060IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7061{
7062 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7063 AssertReleaseFailed();
7064}
7065
7066
7067IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7068{
7069 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7070 AssertReleaseFailed();
7071}
7072
7073
7074/*
7075 * PXOR / VPXOR / XORPS / VXORPS / XORPD / VXORPD
7076 */
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7079{
7080 RT_NOREF(pFpuState);
7081 *puDst ^= *puSrc;
7082}
7083
7084
7085IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7086{
7087 RT_NOREF(pFpuState);
7088 puDst->au64[0] ^= puSrc->au64[0];
7089 puDst->au64[1] ^= puSrc->au64[1];
7090}
7091
7092#endif /* IEM_WITHOUT_ASSEMBLY */
7093
7094IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7095 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7096{
7097 RT_NOREF(pExtState);
7098 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7099 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7104 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7105{
7106 RT_NOREF(pExtState);
7107 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7108 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7109 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7110 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7111}
7112
7113#ifdef IEM_WITHOUT_ASSEMBLY
7114
7115IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7116{
7117 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7118 AssertReleaseFailed();
7119
7120}
7121
7122
7123IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, PCRTUINT128U pu128Src))
7124{
7125 RT_NOREF(pFpuState, pu64Dst, pu128Src);
7126 AssertReleaseFailed();
7127}
7128
7129
7130IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src, uint8_t bEvil))
7131{
7132 RT_NOREF(pFpuState, pu64Dst, pu64Src, bEvil);
7133 AssertReleaseFailed();
7134}
7135
7136
7137IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
7138{
7139 RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
7140 AssertReleaseFailed();
7141}
7142
7143
7144IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
7145{
7146 RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
7147 AssertReleaseFailed();
7148}
7149
7150
7151IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src, uint8_t bEvil))
7152{
7153 RT_NOREF(pFpuState, pu128Dst, pu128Src, bEvil);
7154 AssertReleaseFailed();
7155}
7156
7157/* PUNPCKHxxx */
7158
7159IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7160{
7161 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7162 AssertReleaseFailed();
7163}
7164
7165
7166IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7167{
7168 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7169 AssertReleaseFailed();
7170}
7171
7172
7173IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7174{
7175 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7176 AssertReleaseFailed();
7177}
7178
7179
7180IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7181{
7182 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7183 AssertReleaseFailed();
7184}
7185
7186
7187IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint64_t const *pu64Src))
7188{
7189 RT_NOREF(pFpuState, pu64Dst, pu64Src);
7190 AssertReleaseFailed();
7191}
7192
7193
7194IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7195{
7196 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7197 AssertReleaseFailed();
7198}
7199
7200
7201IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, PCRTUINT128U pu128Src))
7202{
7203 RT_NOREF(pFpuState, pu128Dst, pu128Src);
7204 AssertReleaseFailed();
7205}
7206
7207/* PUNPCKLxxx */
7208
7209IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
7210{
7211 RT_NOREF(pFpuState, pu64Dst, pu32Src);
7212 AssertReleaseFailed();
7213}
7214
7215
7216IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
7217{
7218 RT_NOREF(pFpuState, pu128Dst, pu64Src);
7219 AssertReleaseFailed();
7220}
7221
7222
7223IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
7224{
7225 RT_NOREF(pFpuState, pu64Dst, pu32Src);
7226 AssertReleaseFailed();
7227}
7228
7229
7230IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
7231{
7232 RT_NOREF(pFpuState, pu128Dst, pu64Src);
7233 AssertReleaseFailed();
7234}
7235
7236
7237IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(PCX86FXSTATE pFpuState, uint64_t *pu64Dst, uint32_t const *pu32Src))
7238{
7239 RT_NOREF(pFpuState, pu64Dst, pu32Src);
7240 AssertReleaseFailed();
7241}
7242
7243
7244IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
7245{
7246 RT_NOREF(pFpuState, pu128Dst, pu64Src);
7247 AssertReleaseFailed();
7248}
7249
7250
7251IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U pu128Dst, uint64_t const *pu64Src))
7252{
7253 RT_NOREF(pFpuState, pu128Dst, pu64Src);
7254 AssertReleaseFailed();
7255}
7256
7257#endif /* IEM_WITHOUT_ASSEMBLY */